From 0f53f2993e8fba18a93d9a80be85a44e7c756553 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 14 Nov 2011 09:02:06 +0000
Subject: [PATCH] removed BEGIN_OPENCV_DEVICE_NAMESPACE macros

---
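Note: every file below gets the same mechanical rewrite: the device-side
namespace macros are replaced by the nested namespaces they expanded to, and
qualified uses of OPENCV_DEVICE_NAMESPACE_ become explicit ::cv::gpu::device::
paths. A minimal before/after sketch (the macro bodies shown here are inferred
from how this patch rewrites their call sites, not quoted from
internal_shared.hpp):

    // Before: declarations hidden behind the namespace macros.
    #define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
    #define END_OPENCV_DEVICE_NAMESPACE   }}}
    #define OPENCV_DEVICE_NAMESPACE_      ::cv::gpu::device::

    BEGIN_OPENCV_DEVICE_NAMESPACE
    namespace mathfunc
    {
        void cartToPolar_gpu(/* ... */);
    }
    END_OPENCV_DEVICE_NAMESPACE

    // After: the same scope spelled out, so readers and tools no longer need
    // the macro definitions to see which namespace a declaration lives in.
    namespace cv { namespace gpu { namespace device
    {
        namespace mathfunc
        {
            void cartToPolar_gpu(/* ... */);
        }
    }}}

    // Call sites change from
    //     using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
    // to
    using namespace ::cv::gpu::device::mathfunc;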
 modules/gpu/src/arithm.cpp | 19 +-
 modules/gpu/src/bilateral_filter.cpp | 19 +-
 modules/gpu/src/blend.cpp | 19 +-
 modules/gpu/src/brute_force_matcher.cpp | 153 +-
 modules/gpu/src/calib3d.cpp | 39 +-
 modules/gpu/src/color.cpp | 275 +-
 modules/gpu/src/cuda/bf_knnmatch.cu | 1816 +++++-----
 modules/gpu/src/cuda/bf_match.cu | 1190 ++++---
 modules/gpu/src/cuda/bf_radius_match.cu | 712 ++--
 modules/gpu/src/cuda/bilateral_filter.cu | 308 +-
 modules/gpu/src/cuda/blend.cu | 124 +-
 modules/gpu/src/cuda/calib3d.cu | 243 +-
 modules/gpu/src/cuda/canny.cu | 690 ++--
 modules/gpu/src/cuda/color.cu | 621 ++--
 modules/gpu/src/cuda/column_filter.cu | 352 +-
 modules/gpu/src/cuda/copy_make_border.cu | 162 +-
 modules/gpu/src/cuda/element_operations.cu | 3642 ++++++++++----------
 modules/gpu/src/cuda/hist.cu | 284 +-
 modules/gpu/src/cuda/hog.cu | 1274 ++++---
 modules/gpu/src/cuda/imgproc.cu | 1568 +++++----
 modules/gpu/src/cuda/internal_shared.hpp | 111 +-
 modules/gpu/src/cuda/match_template.cu | 1536 +++++----
 modules/gpu/src/cuda/mathfunc.cu | 294 +-
 modules/gpu/src/cuda/matrix_operations.cu | 495 ++-
 modules/gpu/src/cuda/matrix_reductions.cu | 3408 +++++++++---------
 modules/gpu/src/cuda/pyr_down.cu | 260 +-
 modules/gpu/src/cuda/pyr_up.cu | 200 +-
 modules/gpu/src/cuda/remap.cu | 398 ++-
 modules/gpu/src/cuda/resize.cu | 428 ++-
 modules/gpu/src/cuda/row_filter.cu | 388 ++-
 modules/gpu/src/cuda/safe_call.hpp | 63 +-
 modules/gpu/src/cuda/split_merge.cu | 922 +++--
 modules/gpu/src/cuda/stereobm.cu | 800 +++--
 modules/gpu/src/cuda/stereobp.cu | 782 +++--
 modules/gpu/src/cuda/stereocsbp.cu | 1334 ++++---
 modules/gpu/src/cuda/surf.cu | 1570 +++++----
 modules/gpu/src/cudastream.cpp | 21 +-
 modules/gpu/src/element_operations.cpp | 250 +-
 modules/gpu/src/filtering.cpp | 29 +-
 modules/gpu/src/hog.cpp | 71 +-
 modules/gpu/src/imgproc.cpp | 341 +-
 modules/gpu/src/initialization.cpp | 29 +-
 modules/gpu/src/match_template.cpp | 163 +-
 modules/gpu/src/matrix_reductions.cpp | 162 +-
 .../src/opencv2/gpu/device/border_interpolate.hpp | 1141 +++---
 modules/gpu/src/opencv2/gpu/device/color.hpp | 345 +-
 .../gpu/src/opencv2/gpu/device/datamov_utils.hpp | 46 +-
 .../src/opencv2/gpu/device/detail/color_detail.hpp | 1787 +++++-----
 .../opencv2/gpu/device/detail/transform_detail.hpp | 577 ++--
 .../gpu/device/detail/type_traits_detail.hpp | 257 +-
 .../opencv2/gpu/device/detail/utility_detail.hpp | 1255 ++++---
 .../gpu/device/detail/vec_distance_detail.hpp | 103 +-
 modules/gpu/src/opencv2/gpu/device/emulation.hpp | 33 +-
 modules/gpu/src/opencv2/gpu/device/filters.hpp | 131 +-
 modules/gpu/src/opencv2/gpu/device/funcattrib.hpp | 39 +-
 modules/gpu/src/opencv2/gpu/device/functional.hpp | 742 ++--
 modules/gpu/src/opencv2/gpu/device/limits.hpp | 370 +-
 .../gpu/src/opencv2/gpu/device/saturate_cast.hpp | 319 +-
 .../gpu/src/opencv2/gpu/device/static_check.hpp | 29 +-
 modules/gpu/src/opencv2/gpu/device/transform.hpp | 45 +-
 modules/gpu/src/opencv2/gpu/device/type_traits.hpp | 55 +-
 modules/gpu/src/opencv2/gpu/device/utility.hpp | 255 +-
 .../gpu/src/opencv2/gpu/device/vec_distance.hpp | 263 +-
 modules/gpu/src/opencv2/gpu/device/vec_math.hpp | 295 +-
 modules/gpu/src/opencv2/gpu/device/vec_traits.hpp | 319 +-
 modules/gpu/src/opencv2/gpu/device/warp.hpp | 111 +-
 modules/gpu/src/opencv2/gpu/device/warp_reduce.hpp | 39 +-
 modules/gpu/src/split_merge.cpp | 21 +-
 modules/gpu/src/stereobm.cpp | 19 +-
 modules/gpu/src/stereobp.cpp | 41 +-
 modules/gpu/src/stereocsbp.cpp | 63 +-
 modules/gpu/src/surf.cpp | 41 +-
 modules/gpu/test/test_video.cpp | 2 +-
 73 files changed, 18038 insertions(+), 18270 deletions(-)

diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index a47d222..1f40156 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -425,21 +425,20 @@ void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
 ////////////////////////////////////////////////////////////////////////
 // Polar <-> Cart
 
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
-    void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
-    void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
-}
-
-END_OPENCV_DEVICE_NAMESPACE
+    namespace mathfunc
+    {
+        void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
+        void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
+    }
+}}}
 
 namespace
 {
     inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
     {
-        using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
+        using namespace ::cv::gpu::device::mathfunc;
 
         CV_DbgAssert(x.size() == y.size() && x.type() == y.type());
         CV_Assert(x.depth() == CV_32F);
@@ -459,7 +458,7 @@ namespace
 
     inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
     {
-        using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
+        using namespace ::cv::gpu::device::mathfunc;
 
         CV_DbgAssert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
         CV_Assert(mag.depth() == CV_32F);
diff --git a/modules/gpu/src/bilateral_filter.cpp b/modules/gpu/src/bilateral_filter.cpp
index 12c159a..d24adee 100644
--- a/modules/gpu/src/bilateral_filter.cpp
+++ b/modules/gpu/src/bilateral_filter.cpp
@@ -55,19 +55,18 @@ void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&,
 
 #else /* !defined (HAVE_CUDA) */
 
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace bilateral_filter
+namespace cv { namespace gpu { namespace device
 {
-    void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);
-
-    void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
-    void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
-}
+    namespace bilateral_filter
+    {
+        void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);
 
-END_OPENCV_DEVICE_NAMESPACE
+        void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
+        void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
+    }
+}}}
 
-using namespace OPENCV_DEVICE_NAMESPACE_ bilateral_filter;
+using namespace ::cv::gpu::device::bilateral_filter;
 
 namespace
 {
diff --git a/modules/gpu/src/blend.cpp b/modules/gpu/src/blend.cpp
index 4c4afc5..7c2a86e 100644
--- a/modules/gpu/src/blend.cpp
+++ b/modules/gpu/src/blend.cpp
@@ -52,19 +52,18 @@ void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const Gpu
 
 #else
 
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace blend
+namespace cv { namespace gpu { namespace device
 {
-    template <typename T>
-    void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
-
-    void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
-}
+    namespace blend
+    {
+        template <typename T>
+        void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
 
-END_OPENCV_DEVICE_NAMESPACE
+        void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
+    }
+}}}
 
-using namespace OPENCV_DEVICE_NAMESPACE_ blend;
+using namespace ::cv::gpu::device::blend;
 
 void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
                           GpuMat& result, Stream& stream)
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
index 1d93146..7f11282 100644
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -82,80 +82,79 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, vector< vec
 
 #else /* !defined (HAVE_CUDA) */
 
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace bf_match
-{
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
-        const DevMem2Di& trainIdx, const DevMem2Df& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
-        const DevMem2Di& trainIdx, const DevMem2Df& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
-        const DevMem2Di& trainIdx, const DevMem2Df& distance,
-        int cc, cudaStream_t stream);
-
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        int cc, cudaStream_t stream);
-}
 
-namespace bf_knnmatch
+namespace cv { namespace gpu { namespace device
 {
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
-        const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
-        const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
-        const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
-        int cc, cudaStream_t stream);
-
-    template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
-        int cc, cudaStream_t stream);
-}
+    namespace bf_match
+    {
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance,
+            int cc, cudaStream_t stream);
+
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+            int cc, cudaStream_t stream);
+    }
 
-namespace bf_radius_match
-{
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
-        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
-        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
-        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
-        int cc, cudaStream_t stream);
-
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
-        int cc, cudaStream_t stream);
-
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
-        int cc, cudaStream_t stream);
-
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
-        int cc, cudaStream_t stream);
-}
+    namespace bf_knnmatch
+    {
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
+            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
+            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
+            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
+            int cc, cudaStream_t stream);
+
+        template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+            int cc, cudaStream_t stream);
+        template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+            int cc, cudaStream_t stream);
+        template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+            int cc, cudaStream_t stream);
+    }
 
-END_OPENCV_DEVICE_NAMESPACE
+    namespace bf_radius_match
+    {
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+            int cc, cudaStream_t stream);
+
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+            int cc, cudaStream_t stream);
+
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+            int cc, cudaStream_t stream);
+
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+            int cc, cudaStream_t stream);
+    }
+}}}
 
 ////////////////////////////////////////////////////////////////////
 // Train collection
@@ -199,7 +198,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
     if (query.empty() || train.empty())
         return;
 
-    using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;
+    using namespace ::cv::gpu::device::bf_match;
 
     typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
                              const DevMem2Di& trainIdx, const DevMem2Df& distance,
@@ -341,7 +340,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
     if (query.empty() || trainCollection.empty())
         return;
 
-    using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;
+    using namespace ::cv::gpu::device::bf_match;
 
     typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
                              const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
@@ -452,7 +451,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
     if (query.empty() || train.empty())
         return;
 
-    using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;
+    using namespace ::cv::gpu::device::bf_knnmatch;
 
     typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
                              const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
@@ -581,7 +580,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
     if (query.empty() || trainCollection.empty())
         return;
 
-    using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;
+    using namespace ::cv::gpu::device::bf_knnmatch;
 
     typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
                              const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
@@ -762,7 +761,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
     if (query.empty() || train.empty())
         return;
 
-    using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;
+    using namespace ::cv::gpu::device::bf_radius_match;
 
     typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
                              const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
@@ -893,7 +892,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
     if (query.empty() || empty())
         return;
 
-    using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;
+    using namespace ::cv::gpu::device::bf_radius_match;
 
     typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
                              const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
diff --git a/modules/gpu/src/calib3d.cpp b/modules/gpu/src/calib3d.cpp
index 8e6e838..bc522f3 100644
--- a/modules/gpu/src/calib3d.cpp
+++ b/modules/gpu/src/calib3d.cpp
@@ -56,31 +56,30 @@ void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat
 
 #else
 
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace transform_points
-{
-    void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
-}
-
-namespace project_points
+namespace cv { namespace gpu { namespace device
 {
-    void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
-}
+    namespace transform_points
+    {
+        void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
+    }
 
-namespace solve_pnp_ransac
-{
-    int maxNumIters();
+    namespace project_points
+    {
+        void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
+    }
 
-    void computeHypothesisScores(
-        const int num_hypotheses, const int num_points, const float* rot_matrices,
-        const float3* transl_vectors, const float3* object, const float2* image,
-        const float dist_threshold, int* hypothesis_scores);
-}
+    namespace solve_pnp_ransac
+    {
+        int maxNumIters();
 
-END_OPENCV_DEVICE_NAMESPACE
+        void computeHypothesisScores(
+            const int num_hypotheses, const int num_points, const float* rot_matrices,
+            const float3* transl_vectors, const float3* object, const float2* image,
+            const float dist_threshold, int* hypothesis_scores);
+    }
+}}}
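The host-side .cpp changes above all reduce to one pattern: the .cpp file
forward-declares the launcher that is defined in the matching .cu file, then
calls it through the explicit namespace. A self-contained sketch (plain
pointers stand in for DevMem2D_<float3> so the snippet compiles on its own;
the names mirror the calib3d declarations above but are illustrative):

    #include <cuda_runtime.h>

    namespace cv { namespace gpu { namespace device
    {
        namespace transform_points
        {
            // Defined in the corresponding .cu translation unit; the host
            // side only declares it and links against the compiled kernel.
            void call(const float* src, const float* rot, const float* transl, float* dst, int count, cudaStream_t stream);
        }
    }}}

    static void transformPointsCaller(const float* src, const float* rot, const float* transl, float* dst, int count, cudaStream_t stream)
    {
        // After this patch the qualification is spelled out instead of going
        // through OPENCV_DEVICE_NAMESPACE_.
        ::cv::gpu::device::transform_points::call(src, rot, transl, dst, count, stream);
    }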
-using namespace OPENCV_DEVICE_NAMESPACE; +using namespace ::cv::gpu::device; namespace { diff --git a/modules/gpu/src/color.cpp b/modules/gpu/src/color.cpp index c4f8b60..d52d797 100644 --- a/modules/gpu/src/color.cpp +++ b/modules/gpu/src/color.cpp @@ -51,8 +51,8 @@ void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu( #else /* !defined (HAVE_CUDA) */ -BEGIN_OPENCV_DEVICE_NAMESPACE - +namespace cv { namespace gpu { namespace device +{ #define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \ void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream); @@ -67,142 +67,141 @@ BEGIN_OPENCV_DEVICE_NAMESPACE OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba) - -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565) - -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra) - -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565) - -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba) 
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra) - -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4) - -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra) - -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4) - -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra) - -#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE -#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL -#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F - -END_OPENCV_DEVICE_NAMESPACE - -using namespace OPENCV_DEVICE_NAMESPACE; + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba) + + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565) + + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb) + 
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra) + + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565) + + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra) + + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4) + + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb) + 
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra) + + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4) + + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra) + + #undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE + #undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL + #undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F +}}} + +using namespace ::cv::gpu::device; namespace { diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu index c8b1171..f59cef0 100644 --- a/modules/gpu/src/cuda/bf_knnmatch.cu +++ b/modules/gpu/src/cuda/bf_knnmatch.cu @@ -45,1117 +45,1115 @@ #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE - -namespace bf_knnmatch { - -/////////////////////////////////////////////////////////////////////////////// -// Reduction - -template -__device__ void findBestMatch(float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, - float* s_distance, int* s_trainIdx) +namespace cv { namespace gpu { namespace device { - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; + namespace bf_knnmatch + { + /////////////////////////////////////////////////////////////////////////////// + // Reduction - s_distance += threadIdx.y * BLOCK_SIZE; - s_trainIdx += threadIdx.y * BLOCK_SIZE; + template + __device__ void findBestMatch(float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + float* s_distance, int* s_trainIdx) + { + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; - s_distance[threadIdx.x] = bestDistance1; - s_trainIdx[threadIdx.x] = bestTrainIdx1; + s_distance += threadIdx.y * BLOCK_SIZE; + s_trainIdx += threadIdx.y * BLOCK_SIZE; - __syncthreads(); + s_distance[threadIdx.x] = bestDistance1; + s_trainIdx[threadIdx.x] = bestTrainIdx1; - if (threadIdx.x == 0) - { - #pragma unroll - for (int i = 0; i < BLOCK_SIZE; ++i) - { - float val = s_distance[i]; + __syncthreads(); - if (val < myBestDistance1) + if (threadIdx.x == 0) { - myBestDistance2 = myBestDistance1; - myBestTrainIdx2 = myBestTrainIdx1; - - myBestDistance1 = val; - myBestTrainIdx1 = s_trainIdx[i]; + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) + { + float val = s_distance[i]; + + if (val < myBestDistance1) + { + myBestDistance2 = myBestDistance1; + myBestTrainIdx2 = myBestTrainIdx1; + + myBestDistance1 = val; + myBestTrainIdx1 = s_trainIdx[i]; 
+ } + else if (val < myBestDistance2) + { + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; + } + } } - else if (val < myBestDistance2) - { - myBestDistance2 = val; - myBestTrainIdx2 = s_trainIdx[i]; - } - } - } - - __syncthreads(); - s_distance[threadIdx.x] = bestDistance2; - s_trainIdx[threadIdx.x] = bestTrainIdx2; + __syncthreads(); - __syncthreads(); + s_distance[threadIdx.x] = bestDistance2; + s_trainIdx[threadIdx.x] = bestTrainIdx2; - if (threadIdx.x == 0) - { - #pragma unroll - for (int i = 0; i < BLOCK_SIZE; ++i) - { - float val = s_distance[i]; + __syncthreads(); - if (val < myBestDistance2) + if (threadIdx.x == 0) { - myBestDistance2 = val; - myBestTrainIdx2 = s_trainIdx[i]; + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) + { + float val = s_distance[i]; + + if (val < myBestDistance2) + { + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; + } + } } - } - } - - bestDistance1 = myBestDistance1; - bestDistance2 = myBestDistance2; - bestTrainIdx1 = myBestTrainIdx1; - bestTrainIdx2 = myBestTrainIdx2; -} + bestDistance1 = myBestDistance1; + bestDistance2 = myBestDistance2; -template -__device__ void findBestMatch(float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, - int& bestImgIdx1, int& bestImgIdx2, - float* s_distance, int* s_trainIdx, int* s_imgIdx) -{ - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - int myBestImgIdx1 = -1; - int myBestImgIdx2 = -1; + bestTrainIdx1 = myBestTrainIdx1; + bestTrainIdx2 = myBestTrainIdx2; + } - s_distance += threadIdx.y * BLOCK_SIZE; - s_trainIdx += threadIdx.y * BLOCK_SIZE; - s_imgIdx += threadIdx.y * BLOCK_SIZE; + template + __device__ void findBestMatch(float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2, + float* s_distance, int* s_trainIdx, int* s_imgIdx) + { + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; - s_distance[threadIdx.x] = bestDistance1; - s_trainIdx[threadIdx.x] = bestTrainIdx1; - s_imgIdx[threadIdx.x] = bestImgIdx1; + s_distance += threadIdx.y * BLOCK_SIZE; + s_trainIdx += threadIdx.y * BLOCK_SIZE; + s_imgIdx += threadIdx.y * BLOCK_SIZE; - __syncthreads(); + s_distance[threadIdx.x] = bestDistance1; + s_trainIdx[threadIdx.x] = bestTrainIdx1; + s_imgIdx[threadIdx.x] = bestImgIdx1; - if (threadIdx.x == 0) - { - #pragma unroll - for (int i = 0; i < BLOCK_SIZE; ++i) - { - float val = s_distance[i]; - - if (val < myBestDistance1) - { - myBestDistance2 = myBestDistance1; - myBestTrainIdx2 = myBestTrainIdx1; - myBestImgIdx2 = myBestImgIdx1; + __syncthreads(); - myBestDistance1 = val; - myBestTrainIdx1 = s_trainIdx[i]; - myBestImgIdx1 = s_imgIdx[i]; - } - else if (val < myBestDistance2) + if (threadIdx.x == 0) { - myBestDistance2 = val; - myBestTrainIdx2 = s_trainIdx[i]; - myBestImgIdx2 = s_imgIdx[i]; + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) + { + float val = s_distance[i]; + + if (val < myBestDistance1) + { + myBestDistance2 = myBestDistance1; + myBestTrainIdx2 = myBestTrainIdx1; + myBestImgIdx2 = myBestImgIdx1; + + myBestDistance1 = val; + myBestTrainIdx1 = s_trainIdx[i]; + myBestImgIdx1 = s_imgIdx[i]; + } + else if (val < myBestDistance2) + { + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; + myBestImgIdx2 = s_imgIdx[i]; + } 
+ } } - } - } - - __syncthreads(); - s_distance[threadIdx.x] = bestDistance2; - s_trainIdx[threadIdx.x] = bestTrainIdx2; - s_imgIdx[threadIdx.x] = bestImgIdx2; + __syncthreads(); - __syncthreads(); + s_distance[threadIdx.x] = bestDistance2; + s_trainIdx[threadIdx.x] = bestTrainIdx2; + s_imgIdx[threadIdx.x] = bestImgIdx2; - if (threadIdx.x == 0) - { - #pragma unroll - for (int i = 0; i < BLOCK_SIZE; ++i) - { - float val = s_distance[i]; + __syncthreads(); - if (val < myBestDistance2) + if (threadIdx.x == 0) { - myBestDistance2 = val; - myBestTrainIdx2 = s_trainIdx[i]; - myBestImgIdx2 = s_imgIdx[i]; + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) + { + float val = s_distance[i]; + + if (val < myBestDistance2) + { + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; + myBestImgIdx2 = s_imgIdx[i]; + } + } } - } - } - bestDistance1 = myBestDistance1; - bestDistance2 = myBestDistance2; + bestDistance1 = myBestDistance1; + bestDistance2 = myBestDistance2; - bestTrainIdx1 = myBestTrainIdx1; - bestTrainIdx2 = myBestTrainIdx2; + bestTrainIdx1 = myBestTrainIdx1; + bestTrainIdx2 = myBestTrainIdx2; - bestImgIdx1 = myBestImgIdx1; - bestImgIdx2 = myBestImgIdx2; -} + bestImgIdx1 = myBestImgIdx1; + bestImgIdx2 = myBestImgIdx2; + } -/////////////////////////////////////////////////////////////////////////////// -// Match Unrolled Cached + /////////////////////////////////////////////////////////////////////////////// + // Match Unrolled Cached -template -__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) -{ - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; - s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0; - } -} - -template -__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, - int& bestImgIdx1, int& bestImgIdx2) -{ - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; - - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + template + __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) { - const int loadX = threadIdx.x + i * BLOCK_SIZE; - - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - - if (loadX < train.cols) + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) { - T val; - - ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + const int loadX = threadIdx.x + i * BLOCK_SIZE; + s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? 
query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0; } - - __syncthreads(); - - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - - __syncthreads(); } - typename Dist::result_type distVal = dist; - - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; - - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + template + __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2) { - if (distVal < bestDistance1) - { - bestImgIdx2 = bestImgIdx1; - bestDistance2 = bestDistance1; - bestTrainIdx2 = bestTrainIdx1; - - bestImgIdx1 = imgIdx; - bestDistance1 = distVal; - bestTrainIdx1 = trainIdx; - } - else if (distVal < bestDistance2) + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { - bestImgIdx2 = imgIdx; - bestDistance2 = distVal; - bestTrainIdx2 = trainIdx; + Dist dist; + + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + + if (loadX < train.cols) + { + T val; + + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } + + typename Dist::result_type distVal = dist; + + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + { + if (distVal < bestDistance1) + { + bestImgIdx2 = bestImgIdx1; + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestImgIdx1 = imgIdx; + bestDistance1 = distVal; + bestTrainIdx1 = trainIdx; + } + else if (distVal < bestDistance2) + { + bestImgIdx2 = imgIdx; + bestDistance2 = distVal; + bestTrainIdx2 = trainIdx; + } + } } } - } -} - -template -__global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) -{ - extern __shared__ int smem[]; - - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - loadQueryToSmem(queryIdx, query, s_query); - - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - - loopUnrolledCached(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); - - __syncthreads(); + template + __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) + { + extern __shared__ int smem[]; - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + const int queryIdx = blockIdx.x * BLOCK_SIZE + 
threadIdx.y; - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } -} - -template -void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + loadQueryToSmem(queryIdx, query, s_query); - const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; - matchUnrolledCached<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + loopUnrolledCached(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + __syncthreads(); -template -__global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) -{ - extern __shared__ int smem[]; + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); + } + } - loadQueryToSmem(queryIdx, query, s_query); + template + void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - int myBestImgIdx1 = -1; - int myBestImgIdx2 = -1; + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? 
MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - Mask m = mask; + matchUnrolledCached<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loopUnrolledCached(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); - } - - __syncthreads(); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + template + __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) + { + extern __shared__ int smem[]; - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } -} - -template -void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? 
MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + loadQueryToSmem(queryIdx, query, s_query); - matchUnrolledCached<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + Mask m = mask; -/////////////////////////////////////////////////////////////////////////////// -// Match Unrolled + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loopUnrolledCached(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); + } -template -__device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, - int& bestImgIdx1, int& bestImgIdx2) -{ - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; + __syncthreads(); - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); - if (loadX < query.cols) + if (queryIdx < query.rows && threadIdx.x == 0) { - T val; - - ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - - ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); } + } - __syncthreads(); + template + void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? 
MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - __syncthreads(); - } + matchUnrolledCached<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - typename Dist::result_type distVal = dist; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + /////////////////////////////////////////////////////////////////////////////// + // Match Unrolled - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + template + __device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2) { - if (distVal < bestDistance1) - { - bestImgIdx2 = bestImgIdx1; - bestDistance2 = bestDistance1; - bestTrainIdx2 = bestTrainIdx1; - - bestImgIdx1 = imgIdx; - bestDistance1 = distVal; - bestTrainIdx1 = trainIdx; - } - else if (distVal < bestDistance2) + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { - bestImgIdx2 = imgIdx; - bestDistance2 = distVal; - bestTrainIdx2 = trainIdx; + Dist dist; + + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + + if (loadX < query.cols) + { + T val; + + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } + + typename Dist::result_type distVal = dist; + + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + { + if (distVal < bestDistance1) + { + bestImgIdx2 = bestImgIdx1; + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestImgIdx1 = imgIdx; + bestDistance1 = distVal; + bestTrainIdx1 = trainIdx; + } + else if (distVal < bestDistance2) + { + bestImgIdx2 = imgIdx; + bestDistance2 = distVal; + bestTrainIdx2 = trainIdx; + } + } } } - } -} -template -__global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) -{ - extern __shared__ int smem[]; + template + __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) + { + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - float myBestDistance1 = numeric_limits::max(); - float 
myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; - loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); + loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } -} + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); + } + } -template -void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - matchUnrolled<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + matchUnrolled<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } -template -__global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) -{ - extern __shared__ int smem[]; + template + __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) + { + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - int 
myBestImgIdx1 = -1; - int myBestImgIdx2 = -1; + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; - Mask m = mask; + Mask m = mask; - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loopUnrolled(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); - } + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loopUnrolled(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); + } - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } -} - -template -void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); + } + } - const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - matchUnrolled<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + matchUnrolled<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); -/////////////////////////////////////////////////////////////////////////////// -// Match + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } -template -__device__ void loop(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, - 
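
The kernels in the hunks above all share one bookkeeping idiom: for k = 2 matching, each thread tracks its best and second-best candidate in registers instead of sorting. A minimal self-contained sketch of that update rule (the function name and parameter grouping are illustrative, not the library's API):

    // Sketch: maintain the two smallest distances seen so far, per thread.
    __device__ __forceinline__ void updateTop2(float distVal, int trainIdx, int imgIdx,
                                               float& bestDistance1, float& bestDistance2,
                                               int& bestTrainIdx1, int& bestTrainIdx2,
                                               int& bestImgIdx1, int& bestImgIdx2)
    {
        if (distVal < bestDistance1)
        {
            // The old best is demoted to second best before being replaced.
            bestDistance2 = bestDistance1; bestTrainIdx2 = bestTrainIdx1; bestImgIdx2 = bestImgIdx1;
            bestDistance1 = distVal;       bestTrainIdx1 = trainIdx;      bestImgIdx1 = imgIdx;
        }
        else if (distVal < bestDistance2)
        {
            bestDistance2 = distVal; bestTrainIdx2 = trainIdx; bestImgIdx2 = imgIdx;
        }
    }

Initializing both distances to FLT_MAX and both indices to -1, as the kernels do, lets the first two valid candidates slot in without special cases.
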
int& bestImgIdx1, int& bestImgIdx2) -{ - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; + /////////////////////////////////////////////////////////////////////////////// + // Match - for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + template + __device__ void loop(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2) { - const int loadX = threadIdx.x + i * BLOCK_SIZE; - - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - - if (loadX < query.cols) + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { - T val; + Dist dist; + + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + + if (loadX < query.cols) + { + T val; + + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } + + typename Dist::result_type distVal = dist; + + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + { + if (distVal < bestDistance1) + { + bestImgIdx2 = bestImgIdx1; + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestImgIdx1 = imgIdx; + bestDistance1 = distVal; + bestTrainIdx1 = trainIdx; + } + else if (distVal < bestDistance2) + { + bestImgIdx2 = imgIdx; + bestDistance2 = distVal; + bestTrainIdx2 = trainIdx; + } + } + } + } - ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + template + __global__ void match(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) + { + extern __shared__ int smem[]; - ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - __syncthreads(); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + + loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); __syncthreads(); - } - typename Dist::result_type distVal = dist; + float* s_distance = 
(float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) - { - if (distVal < bestDistance1) + if (queryIdx < query.rows && threadIdx.x == 0) { - bestImgIdx2 = bestImgIdx1; - bestDistance2 = bestDistance1; - bestTrainIdx2 = bestTrainIdx1; - - bestImgIdx1 = imgIdx; - bestDistance1 = distVal; - bestTrainIdx1 = trainIdx; - } - else if (distVal < bestDistance2) - { - bestImgIdx2 = imgIdx; - bestDistance2 = distVal; - bestTrainIdx2 = trainIdx; + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); } } - } -} -template -__global__ void match(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) -{ - extern __shared__ int smem[]; + template + void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + match<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); + template + __global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) + { + extern __shared__ int smem[]; - __syncthreads(); + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } -} - -template -void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + Mask m = mask; - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + for (int imgIdx = 0; 
imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loop(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); + } - match<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + __syncthreads(); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); -template -__global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) -{ - extern __shared__ int smem[]; + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); + } + } - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + template + void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - int myBestImgIdx1 = -1; - int myBestImgIdx2 = -1; + const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - Mask m = mask; + match<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loop(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - __syncthreads(); + /////////////////////////////////////////////////////////////////////////////// + // knnMatch 2 dispatcher - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + template + void match2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 128) + { + matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, 
Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + }*/ + else + { + match<16, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + } - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); + template + void match2Dispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 128) + { + matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + }*/ + else + { + match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + } - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } -} - -template -void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + /////////////////////////////////////////////////////////////////////////////// + // Calc distance kernel - const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + template + __global__ void calcDistanceUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, PtrStepf allDist) + { + extern __shared__ int smem[]; - match<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); -/////////////////////////////////////////////////////////////////////////////// -// knnMatch 2 dispatcher + Dist dist; 
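
match2Dispatcher above selects a kernel instantiation by descriptor width: descriptors of at most 64 or 128 columns get a tile loop fully unrolled at compile time, while wider ones (the 256/512/1024 cases are commented out) fall back to the kernel with a runtime loop bound. A compilable sketch of the pattern, with hypothetical kernel names:

    // Hypothetical kernels: the unrolled variant bakes the maximum descriptor
    // length into the type, so MAX_DESC_LEN / BLOCK_SIZE iterations unroll fully.
    template <int BLOCK_SIZE, int MAX_DESC_LEN>
    __global__ void runUnrolled(int cols) { /* #pragma unroll tile loop */ }

    template <int BLOCK_SIZE>
    __global__ void runGeneric(int cols) { /* runtime-bounded tile loop */ }

    void dispatchByDescriptorWidth(int cols)
    {
        if (cols <= 64)
            runUnrolled<16, 64><<<1, 16 * 16>>>(cols);
        else if (cols <= 128)
            runUnrolled<16, 128><<<1, 16 * 16>>>(cols);
        else
            runGeneric<16><<<1, 16 * 16>>>(cols);
    }

The cost is one compiled kernel per supported width, which is presumably why the larger instantiations stay commented out until needed.
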
-template -void match2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) -{ - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - }*/ - else - { - match<16, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } -} - -template -void match2Dispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) -{ - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - }*/ - else - { - match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } -} + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + if (loadX < query.cols) + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX]; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; + } + else + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } -/////////////////////////////////////////////////////////////////////////////// -// Calc distance kernel + if (queryIdx < query.rows && trainIdx < train.rows) + { + float distVal = 
numeric_limits::max(); -template -__global__ void calcDistanceUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, PtrStepf allDist) -{ - extern __shared__ int smem[]; + if (mask(queryIdx, trainIdx)) + distVal = (typename Dist::result_type)dist; - const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; - const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + allDist.ptr(queryIdx)[trainIdx] = distVal; + } + } - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + template + void calcDistanceUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - Dist dist; + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + calcDistanceUnrolled<<>>(query, train, mask, allDist); + cudaSafeCall( cudaGetLastError() ); - if (loadX < query.cols) - { - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX]; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; - } - else - { - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } - __syncthreads(); - - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - - __syncthreads(); - } + template + __global__ void calcDistance(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, PtrStepf allDist) + { + extern __shared__ int smem[]; - if (queryIdx < query.rows && trainIdx < train.rows) - { - float distVal = numeric_limits::max(); + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; - if (mask(queryIdx, trainIdx)) - distVal = (typename Dist::result_type)dist; + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - allDist.ptr(queryIdx)[trainIdx] = distVal; - } -} + Dist dist; -template -void calcDistanceUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + if (loadX < query.cols) + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX]; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; + } + else + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + 
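
For k > 2 the calcDistance kernels above take a different route: they materialize the whole query-by-train distance matrix, writing FLT_MAX for masked-out pairs so the later selection passes skip them naturally. A reduced one-thread-per-pair sketch (row-major layout, squared L2, and a 0/1 byte mask are assumptions of this sketch, not the library's types):

    #include <cfloat>

    __global__ void calcDistanceSketch(const float* query, const float* train,
                                       const unsigned char* mask, float* allDist,
                                       int queryRows, int trainRows, int cols)
    {
        const int queryIdx = blockIdx.y * blockDim.y + threadIdx.y;
        const int trainIdx = blockIdx.x * blockDim.x + threadIdx.x;

        if (queryIdx < queryRows && trainIdx < trainRows)
        {
            float distVal = FLT_MAX;               // masked pairs stay "infinitely far"

            if (mask == 0 || mask[queryIdx * trainRows + trainIdx])
            {
                float sum = 0.f;                   // squared L2, for illustration
                for (int j = 0; j < cols; ++j)
                {
                    const float d = query[queryIdx * cols + j] - train[trainIdx * cols + j];
                    sum += d * d;
                }
                distVal = sum;
            }

            allDist[queryIdx * trainRows + trainIdx] = distVal;
        }
    }
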
__syncthreads(); + } - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + if (queryIdx < query.rows && trainIdx < train.rows) + { + float distVal = numeric_limits::max(); - calcDistanceUnrolled<<>>(query, train, mask, allDist); - cudaSafeCall( cudaGetLastError() ); + if (mask(queryIdx, trainIdx)) + distVal = (typename Dist::result_type)dist; - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + allDist.ptr(queryIdx)[trainIdx] = distVal; + } + } -template -__global__ void calcDistance(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, PtrStepf allDist) -{ - extern __shared__ int smem[]; + template + void calcDistance(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; - const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + calcDistance<<>>(query, train, mask, allDist); + cudaSafeCall( cudaGetLastError() ); - Dist dist; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + /////////////////////////////////////////////////////////////////////////////// + // Calc Distance dispatcher - if (loadX < query.cols) + template + void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Df& allDist, + int cc, cudaStream_t stream) { - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX]; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; - } - else - { - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + if (query.cols <= 64) + { + calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream); + } + else if (query.cols <= 128) + { + calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream); + } + /*else if (query.cols <= 256) + { + calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream); + } + else if (query.cols <= 512) + { + calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream); + } + else if (query.cols <= 1024) + { + calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream); + }*/ + else + { + calcDistance<16, Dist>(query, train, mask, allDist, stream); + } } - __syncthreads(); - - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + /////////////////////////////////////////////////////////////////////////////// + // find knn match kernel - __syncthreads(); - } + template + __global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance) + { + const int SMEM_SIZE = BLOCK_SIZE > 64 ? 
BLOCK_SIZE : 64; + __shared__ float s_dist[SMEM_SIZE]; + __shared__ int s_trainIdx[SMEM_SIZE]; - if (queryIdx < query.rows && trainIdx < train.rows) - { - float distVal = numeric_limits::max(); + const int queryIdx = blockIdx.x; - if (mask(queryIdx, trainIdx)) - distVal = (typename Dist::result_type)dist; + float* allDistRow = allDist.ptr(queryIdx); - allDist.ptr(queryIdx)[trainIdx] = distVal; - } -} + float dist = numeric_limits::max(); + int bestIdx = -1; + + for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE) + { + float reg = allDistRow[i]; + if (reg < dist) + { + dist = reg; + bestIdx = i; + } + } -template -void calcDistance(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + s_dist[threadIdx.x] = dist; + s_trainIdx[threadIdx.x] = bestIdx; + __syncthreads(); - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + reducePredVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); - calcDistance<<>>(query, train, mask, allDist); - cudaSafeCall( cudaGetLastError() ); + if (threadIdx.x == 0) + { + if (dist < numeric_limits::max()) + { + allDistRow[bestIdx] = numeric_limits::max(); + trainIdx.ptr(queryIdx)[i] = bestIdx; + distance.ptr(queryIdx)[i] = dist; + } + } + } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + template + void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, 1, 1); + const dim3 grid(trainIdx.rows, 1, 1); -/////////////////////////////////////////////////////////////////////////////// -// Calc Distance dispatcher + for (int i = 0; i < k; ++i) + { + findBestMatch<<>>(allDist, i, trainIdx, distance); + cudaSafeCall( cudaGetLastError() ); + } -template -void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Df& allDist, - int cc, cudaStream_t stream) -{ - if (query.cols <= 64) - { - calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream); - } - else if (query.cols <= 128) - { - calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream); - } - /*else if (query.cols <= 256) - { - calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream); - } - else if (query.cols <= 512) - { - calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream); - } - else if (query.cols <= 1024) - { - calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream); - }*/ - else - { - calcDistance<16, Dist>(query, train, mask, allDist, stream); - } -} + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } -/////////////////////////////////////////////////////////////////////////////// -// find knn match kernel + void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) + { + findKnnMatch<256>(k, static_cast(trainIdx), static_cast(distance), allDist, stream); + } -template -__global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance) -{ - const int SMEM_SIZE = BLOCK_SIZE > 64 ? 
BLOCK_SIZE : 64; - __shared__ float s_dist[SMEM_SIZE]; - __shared__ int s_trainIdx[SMEM_SIZE]; + /////////////////////////////////////////////////////////////////////////////// + // knn match Dispatcher - const int queryIdx = blockIdx.x; + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, int k, const Mask& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + int cc, cudaStream_t stream) + { + if (k == 2) + { + match2Dispatcher(query, train, mask, trainIdx, distance, cc, stream); + } + else + { + calcDistanceDispatcher(query, train, mask, allDist, cc, stream); + findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream); + } + } - float* allDistRow = allDist.ptr(queryIdx); + /////////////////////////////////////////////////////////////////////////////// + // knn match caller - float dist = numeric_limits::max(); - int bestIdx = -1; - - for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE) - { - float reg = allDistRow[i]; - if (reg < dist) + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + int cc, cudaStream_t stream) { - dist = reg; - bestIdx = i; + if (mask.data) + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + else + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); } - } - s_dist[threadIdx.x] = dist; - s_trainIdx[threadIdx.x] = bestIdx; - __syncthreads(); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - reducePredVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); - - if (threadIdx.x == 0) - { - if (dist < numeric_limits::max()) + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + int cc, cudaStream_t stream) { - allDistRow[bestIdx] = numeric_limits::max(); - trainIdx.ptr(queryIdx)[i] = bestIdx; - distance.ptr(queryIdx)[i] = dist; + 
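
The findBestMatch kernel above is launched k times from findKnnMatch: each pass finds the minimum of a query's allDist row, records it in output column i, then overwrites the winning cell with FLT_MAX so the next pass yields the next-best neighbour. A sketch of one pass, with the block reduction collapsed to a serial scan for brevity (names are illustrative):

    #include <cfloat>

    // One block per query row; pass i of k.
    __global__ void selectPassSketch(float* allDist, int cols, int k,
                                     int* trainIdxOut, float* distOut, int i)
    {
        float* row = allDist + blockIdx.x * cols;

        if (threadIdx.x == 0)   // serial scan stands in for the shared-memory reduction
        {
            float best = FLT_MAX;
            int bestIdx = -1;

            for (int c = 0; c < cols; ++c)
                if (row[c] < best) { best = row[c]; bestIdx = c; }

            if (bestIdx >= 0 && best < FLT_MAX)
            {
                row[bestIdx] = FLT_MAX;                 // exclude from later passes
                trainIdxOut[blockIdx.x * k + i] = bestIdx;
                distOut[blockIdx.x * k + i]     = best;
            }
        }
    }

    // Host side, as in findKnnMatch above:
    // for (int i = 0; i < k; ++i)
    //     selectPassSketch<<<numQueries, 256>>>(allDist, cols, k, trainIdx, distance, i);
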
if (mask.data) + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); } - } -} - -template -void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, 1, 1); - const dim3 grid(trainIdx.rows, 1, 1); - - for (int i = 0; i < k; ++i) - { - findBestMatch<<>>(allDist, i, trainIdx, distance); - cudaSafeCall( cudaGetLastError() ); - } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) -{ - findKnnMatch<256>(k, static_cast(trainIdx), static_cast(distance), allDist, stream); -} + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + int cc, cudaStream_t stream) + { + if (mask.data) + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); + } -/////////////////////////////////////////////////////////////////////////////// -// knn match Dispatcher + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int 
k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template -void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, int k, const Mask& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, - int cc, cudaStream_t stream) -{ - if (k == 2) - { - match2Dispatcher(query, train, mask, trainIdx, distance, cc, stream); - } - else - { - calcDistanceDispatcher(query, train, mask, allDist, cc, stream); - findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream); - } -} + template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + match2Dispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + else + match2Dispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } -/////////////////////////////////////////////////////////////////////////////// -// knn match caller + template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, - int cc, cudaStream_t stream) -{ - if (mask.data) - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); - else - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, 
distance, allDist, cc, stream); -} - -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -//template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - -template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, - int cc, cudaStream_t stream) -{ - if (mask.data) - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); - else - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); -} - -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, - int cc, cudaStream_t stream) -{ - if (mask.data) - 
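
The long template-instantiation lists above (with unused element types commented out) are not decoration: the kernel templates are defined only in this .cu file compiled by nvcc, so every element type reachable from the host-side API must be instantiated explicitly here or the linker has nothing to resolve. The mechanism in miniature:

    // In a header visible to callers: declaration only.
    template <typename T> void matchSketch_gpu(const T* query, const T* train, int n);

    // In the .cu file: the definition...
    template <typename T> void matchSketch_gpu(const T* query, const T* train, int n)
    {
        // ... configure and launch the kernels for element type T ...
    }

    // ...plus one explicit instantiation per supported type.
    template void matchSketch_gpu<unsigned char>(const unsigned char*, const unsigned char*, int);
    template void matchSketch_gpu<unsigned short>(const unsigned short*, const unsigned short*, int);
    template void matchSketch_gpu<float>(const float*, const float*, int);
    // template void matchSketch_gpu<signed char>(const signed char*, const signed char*, int); // unused: omitted to cut compile time and binary size
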
matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); - else - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); -} - -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - -template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) -{ - if (masks.data) - match2Dispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); - else - match2Dispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); -} - -template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -//template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - -template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) -{ - if (masks.data) - 
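
Each caller above branches once on mask.data and then commits to a mask type (SingleMask, MaskCollection, or WithOutMask) as a template argument, so the unmasked kernels carry no per-pair branch at all. The idea, with simplified stand-in types rather than the library's own:

    // Compile-time mask polymorphism: the mask is a functor type, so the
    // no-mask case inlines to a constant `true` and vanishes entirely.
    struct WithOutMaskSketch
    {
        __device__ __forceinline__ bool operator()(int, int) const { return true; }
    };

    struct SingleMaskSketch
    {
        const unsigned char* data;
        int step;   // row stride in elements

        __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
        {
            return data[queryIdx * step + trainIdx] != 0;
        }
    };

    template <typename Mask>
    __global__ void kernelSketch(Mask mask, int rows, int cols, int* out)
    {
        const int q = blockIdx.y * blockDim.y + threadIdx.y;
        const int t = blockIdx.x * blockDim.x + threadIdx.x;

        if (q < rows && t < cols && mask(q, t))
            out[q * cols + t] = 1;
    }
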
match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); - else - match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); -} - -//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Di& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - -template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) -{ - if (masks.data) - match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); - else - match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); -} + template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + else + match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } -template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -//template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -//template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const 
DevMem2Db& distance, int cc, cudaStream_t stream); -template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Di& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -} // namespace bf_knnmatch + template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + else + match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } -END_OPENCV_DEVICE_NAMESPACE + template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + } // namespace bf_knnmatch +}}} // namespace cv { namespace gpu { namespace device { diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu index 0ab56be..7d6d62b 100644 --- a/modules/gpu/src/cuda/bf_match.cu +++ b/modules/gpu/src/cuda/bf_match.cu @@ -45,736 +45,734 @@ 
#include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE +namespace cv { namespace gpu { namespace device +{ + namespace bf_match + { + /////////////////////////////////////////////////////////////////////////////// + // Reduction -namespace bf_match { + template + __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx) + { + s_distance += threadIdx.y * BLOCK_SIZE; + s_trainIdx += threadIdx.y * BLOCK_SIZE; -/////////////////////////////////////////////////////////////////////////////// -// Reduction + s_distance[threadIdx.x] = bestDistance; + s_trainIdx[threadIdx.x] = bestTrainIdx; -template -__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx) -{ - s_distance += threadIdx.y * BLOCK_SIZE; - s_trainIdx += threadIdx.y * BLOCK_SIZE; + __syncthreads(); - s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; + reducePredVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); + } - __syncthreads(); + template + __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx) + { + s_distance += threadIdx.y * BLOCK_SIZE; + s_trainIdx += threadIdx.y * BLOCK_SIZE; + s_imgIdx += threadIdx.y * BLOCK_SIZE; - reducePredVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); -} + s_distance[threadIdx.x] = bestDistance; + s_trainIdx[threadIdx.x] = bestTrainIdx; + s_imgIdx [threadIdx.x] = bestImgIdx; -template -__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx) -{ - s_distance += threadIdx.y * BLOCK_SIZE; - s_trainIdx += threadIdx.y * BLOCK_SIZE; - s_imgIdx += threadIdx.y * BLOCK_SIZE; + __syncthreads(); - s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; - s_imgIdx [threadIdx.x] = bestImgIdx; + reducePredVal2(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less()); + } - __syncthreads(); + /////////////////////////////////////////////////////////////////////////////// + // Match Unrolled Cached - reducePredVal2(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less()); -} + template + __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) + { + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0; + } + } -/////////////////////////////////////////////////////////////////////////////// -// Match Unrolled Cached + template + __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance, int& bestTrainIdx, int& bestImgIdx) + { + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) + { + Dist dist; -template -__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) -{ - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; - s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? 
query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0; - } -} - -template -__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance, int& bestTrainIdx, int& bestImgIdx) -{ - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + if (loadX < train.cols) + { + T val; - if (loadX < train.cols) - { - T val; + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + __syncthreads(); - __syncthreads(); + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + __syncthreads(); + } - __syncthreads(); - } + typename Dist::result_type distVal = dist; - typename Dist::result_type distVal = dist; + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) + { + bestImgIdx = imgIdx; + bestDistance = distVal; + bestTrainIdx = trainIdx; + } + } + } - if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) + template + __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) { - bestImgIdx = imgIdx; - bestDistance = distVal; - bestTrainIdx = trainIdx; - } - } -} + extern __shared__ int smem[]; -template -__global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) -{ - extern __shared__ int smem[]; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); + loadQueryToSmem(queryIdx, query, s_query); - loadQueryToSmem(queryIdx, query, s_query); + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; + loopUnrolledCached(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); - loopUnrolledCached(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); + 
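One subtlety in the single-image kernel just above: loopUnrolledCached takes separate bestTrainIdx and bestImgIdx out-parameters, and the caller passes myBestTrainIdx for both. That is benign only because the update writes the image index before the train index, so when the two references alias, the train index wins. A reduced sketch; updateBest is a hypothetical name, but the store order mirrors the code here.

    // Sketch: why passing one lvalue for both out-params is safe in the 1-image case.
    __device__ __forceinline__ void updateBest(float d, int t, int img,
                                               float& bestDist, int& bestTrain, int& bestImg)
    {
        bestImg   = img;  // written first; a dead store when bestImg aliases bestTrain
        bestDist  = d;
        bestTrain = t;    // final value observed by the caller
    }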
__syncthreads(); - __syncthreads(); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); - findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; + } + } - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestDistance[queryIdx] = myBestDistance; - } -} - -template -void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + template + void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - matchUnrolledCached<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + matchUnrolledCached<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } -template -__global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, - int* bestTrainIdx, int* bestImgIdx, float* bestDistance) -{ - extern __shared__ int smem[]; + template + __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, + int* bestTrainIdx, int* bestImgIdx, float* bestDistance) + { + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - loadQueryToSmem(queryIdx, query, s_query); + loadQueryToSmem(queryIdx, query, s_query); - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; - int myBestImgIdx = -1; + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; + int myBestImgIdx = -1; - Mask m = mask; + Mask m = mask; - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loopUnrolledCached(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); - } + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loopUnrolledCached(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, 
myBestImgIdx); + } - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx); + findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestImgIdx[queryIdx] = myBestImgIdx; - bestDistance[queryIdx] = myBestDistance; - } -} - -template -void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestImgIdx[queryIdx] = myBestImgIdx; + bestDistance[queryIdx] = myBestDistance; + } + } - const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + template + void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - matchUnrolledCached<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? 
MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + matchUnrolledCached<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); -/////////////////////////////////////////////////////////////////////////////// -// Match Unrolled + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } -template -__device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance, int& bestTrainIdx, int& bestImgIdx) -{ - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; + /////////////////////////////////////////////////////////////////////////////// + // Match Unrolled - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + template + __device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance, int& bestTrainIdx, int& bestImgIdx) { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) + { + Dist dist; - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; - if (loadX < query.cols) - { - T val; + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + + if (loadX < query.cols) + { + T val; + + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } - ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + typename Dist::result_type distVal = dist; - ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + + if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) + { + bestImgIdx = imgIdx; + bestDistance = distVal; + bestTrainIdx = trainIdx; + } } + } - __syncthreads(); + template + __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) + { + extern __shared__ int smem[]; - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; + + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE 
* BLOCK_SIZE); + + loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); __syncthreads(); - } - typename Dist::result_type distVal = dist; + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); - if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; + } + } + + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + cudaStream_t stream) { - bestImgIdx = imgIdx; - bestDistance = distVal; - bestTrainIdx = trainIdx; + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + matchUnrolled<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } - } -} -template -__global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) -{ - extern __shared__ int smem[]; + template + __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, + int* bestTrainIdx, int* bestImgIdx, float* bestDistance) + { + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; + int myBestImgIdx = -1; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - - loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - __syncthreads(); + Mask m = mask; + + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loopUnrolled(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); + } - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + __syncthreads(); - findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestDistance[queryIdx] = myBestDistance; - } -} + findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx); -template -void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 
grid(divUp(query.rows, BLOCK_SIZE)); + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestImgIdx[queryIdx] = myBestImgIdx; + bestDistance[queryIdx] = myBestDistance; + } + } - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - matchUnrolled<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + matchUnrolled<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); -template -__global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, - int* bestTrainIdx, int* bestImgIdx, float* bestDistance) -{ - extern __shared__ int smem[]; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + /////////////////////////////////////////////////////////////////////////////// + // Match - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; - int myBestImgIdx = -1; + template + __device__ void loop(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance, int& bestTrainIdx, int& bestImgIdx) + { + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) + { + Dist dist; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; - Mask m = mask; - - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loopUnrolled(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); - } + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - __syncthreads(); + if (loadX < query.cols) + { + T val; - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx); + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestImgIdx[queryIdx] = myBestImgIdx; - bestDistance[queryIdx] = myBestDistance; - } -} - -template -void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - cudaStream_t stream) -{ - const dim3 
block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + __syncthreads(); - const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - matchUnrolled<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + __syncthreads(); + } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + typename Dist::result_type distVal = dist; -/////////////////////////////////////////////////////////////////////////////// -// Match + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; -template -__device__ void loop(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance, int& bestTrainIdx, int& bestImgIdx) -{ - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; + if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) + { + bestImgIdx = imgIdx; + bestDistance = distVal; + bestTrainIdx = trainIdx; + } + } + } - for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + template + __global__ void match(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + extern __shared__ int smem[]; - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - if (loadX < query.cols) - { - T val; + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; - ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - - ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + + loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); __syncthreads(); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - __syncthreads(); + findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); + + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; + } } - typename Dist::result_type distVal = dist; + template + void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + match<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (queryIdx < query.rows && trainIdx < train.rows 
&& distVal < bestDistance && mask(queryIdx, trainIdx)) - { - bestImgIdx = imgIdx; - bestDistance = distVal; - bestTrainIdx = trainIdx; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } - } -} -template -__global__ void match(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) -{ - extern __shared__ int smem[]; + template + __global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, + int* bestTrainIdx, int* bestImgIdx, float* bestDistance) + { + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; + int myBestImgIdx = -1; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - - loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - __syncthreads(); + Mask m = mask; + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loop(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); + } - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + __syncthreads(); - findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestDistance[queryIdx] = myBestDistance; - } -} - -template -void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx); - match<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestImgIdx[queryIdx] = myBestImgIdx; + bestDistance[queryIdx] = myBestDistance; + } + } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + template + void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); -template -__global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, - int* bestTrainIdx, int* bestImgIdx, float* bestDistance) -{ - extern __shared__ int smem[]; + const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + match<<>>(query, trains, n, mask, 
trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; - int myBestImgIdx = -1; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + /////////////////////////////////////////////////////////////////////////////// + // Match dispatcher - Mask m = mask; - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loop(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); - } + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream); + } + else if (query.cols <= 128) + { + matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream); + }*/ + else + { + match<16, Dist>(query, train, mask, trainIdx, distance, stream); + } + } - __syncthreads(); + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + } + else if (query.cols <= 128) + { + matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + }*/ + else + { + match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + } + } - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + /////////////////////////////////////////////////////////////////////////////// + // Match caller - findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (mask.data) + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), + trainIdx, distance, + cc, stream); + } + else + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), + trainIdx, distance, + cc, stream); + } + } - if (queryIdx < 
query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestImgIdx[queryIdx] = myBestImgIdx; - bestDistance[queryIdx] = myBestDistance; - } -} - -template -void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (mask.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), + trainIdx, distance, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), + trainIdx, distance, + cc, stream); + } + } - match<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); 
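Every launcher in this file sizes its dynamic shared memory from the same two formulas, so one worked example covers them all: with the dispatcher's BLOCK_SIZE of 16 and MAX_DESC_LEN of 64, the cached single-image variant requests (16*64 + 16*16) * sizeof(int) = 5120 bytes per block, while the plain variants request two (2048 bytes) or three (3072 bytes) square tiles. A standalone check of that arithmetic; the constants come from this file, the program itself is illustrative.

    #include <cstdio>

    int main()
    {
        const size_t BLOCK_SIZE   = 16;  // the dispatcher hardcodes 16
        const size_t MAX_DESC_LEN = 64;  // chosen when query.cols <= 64

        // matchUnrolledCached: cached query rows plus one train tile, int-sized slots
        const size_t cached = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE)
                               + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
        // match / matchUnrolled, single image: query tile + train tile
        const size_t plain2 = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
        // multi-image variants add a third tile for the reduction's imgIdx slots
        const size_t plain3 = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

        std::printf("%zu %zu %zu\n", cached, plain2, plain3);  // prints: 5120 2048 3072
        return 0;
    }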
-} + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (mask.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), + trainIdx, distance, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), + trainIdx, distance, + cc, stream); + } + } -/////////////////////////////////////////////////////////////////////////////// -// Match dispatcher + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template -void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream); - }*/ - else - { - match<16, Dist>(query, train, mask, trainIdx, distance, stream); - } -} - -template -void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - }*/ - else - { - match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } -} + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const 
DevMem2D_& masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), + trainIdx, imgIdx, distance, + cc, stream); + } + else + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), + trainIdx, imgIdx, distance, + cc, stream); + } + } -/////////////////////////////////////////////////////////////////////////////// -// Match caller + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (mask.data) - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), - trainIdx, distance, - cc, stream); - } - else - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), - trainIdx, distance, - cc, stream); - } -} - -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, 
cudaStream_t stream); - -template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (mask.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), - trainIdx, distance, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), - trainIdx, distance, - cc, stream); - } -} - -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (mask.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), - trainIdx, distance, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), - trainIdx, distance, - cc, stream); - } -} - -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (masks.data) - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, 
MaskCollection(masks.data), - trainIdx, imgIdx, distance, - cc, stream); - } - else - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), - trainIdx, imgIdx, distance, - cc, stream); - } -} - -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - -template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (masks.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), - trainIdx, imgIdx, distance, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), - trainIdx, imgIdx, distance, - cc, stream); - } -} - -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const 
DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (masks.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), - trainIdx, imgIdx, distance, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), - trainIdx, imgIdx, distance, - cc, stream); - } -} + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), + trainIdx, imgIdx, distance, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), + trainIdx, imgIdx, distance, + cc, stream); + } + } -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -} // namespace bf_match + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const 
DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), + trainIdx, imgIdx, distance, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), + trainIdx, imgIdx, distance, + cc, stream); + } + } -END_OPENCV_DEVICE_NAMESPACE + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + } // namespace bf_match +}}} // namespace cv { namespace gpu { namespace device { diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu index 519ed7f..39b721a 100644 --- a/modules/gpu/src/cuda/bf_radius_match.cu +++ b/modules/gpu/src/cuda/bf_radius_match.cu @@ -45,423 +45,421 @@ #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE - -namespace bf_radius_match { - -/////////////////////////////////////////////////////////////////////////////// -// Match Unrolled - -template -__global__ void matchUnrolled(const DevMem2D_ query, int imgIdx, const DevMem2D_ train, float maxDistance, const Mask mask, - PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) +namespace cv { namespace gpu { namespace device { - #if __CUDA_ARCH__ >= 110 - - extern __shared__ int smem[]; - - const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; - const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; - - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + namespace bf_radius_match + { + /////////////////////////////////////////////////////////////////////////////// + // Match Unrolled - Dist dist; + template + __global__ void matchUnrolled(const DevMem2D_ query, int imgIdx, const DevMem2D_ train, float maxDistance, const Mask mask, + PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) + { + #if __CUDA_ARCH__ >= 110 - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + extern __shared__ int smem[]; - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = 
blockIdx.x * BLOCK_SIZE + threadIdx.x; - if (loadX < query.cols) - { - T val; + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + Dist dist; - ForceGlob::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; - __syncthreads(); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + if (loadX < query.cols) + { + T val; - __syncthreads(); - } + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - float distVal = (typename Dist::result_type)dist; + ForceGlob::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance) - { - unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); - if (ind < maxCount) - { - bestTrainIdx.ptr(queryIdx)[ind] = trainIdx; - if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; - bestDistance.ptr(queryIdx)[ind] = distVal; - } - } + __syncthreads(); - #endif -} + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); -template -void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + __syncthreads(); + } - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + float distVal = (typename Dist::result_type)dist; - matchUnrolled<<>>(query, 0, train, maxDistance, mask, - trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); - cudaSafeCall( cudaGetLastError() ); + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance) + { + unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); + if (ind < maxCount) + { + bestTrainIdx.ptr(queryIdx)[ind] = trainIdx; + if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; + bestDistance.ptr(queryIdx)[ind] = distVal; + } + } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + #endif + } -template -void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, cudaStream_t 
stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - for (int i = 0; i < n; ++i) - { - const DevMem2D_ train = trains[i]; + matchUnrolled<<>>(query, 0, train, maxDistance, mask, + trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); + cudaSafeCall( cudaGetLastError() ); - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - if (masks != 0 && masks[i].data) - { - matchUnrolled<<>>(query, i, train, maxDistance, SingleMask(masks[i]), - trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); - } - else + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + cudaStream_t stream) { - matchUnrolled<<>>(query, i, train, maxDistance, WithOutMask(), - trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + for (int i = 0; i < n; ++i) + { + const DevMem2D_ train = trains[i]; + + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + + if (masks != 0 && masks[i].data) + { + matchUnrolled<<>>(query, i, train, maxDistance, SingleMask(masks[i]), + trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + } + else + { + matchUnrolled<<>>(query, i, train, maxDistance, WithOutMask(), + trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + } + cudaSafeCall( cudaGetLastError() ); + } + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } - cudaSafeCall( cudaGetLastError() ); - } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + /////////////////////////////////////////////////////////////////////////////// + // Match -/////////////////////////////////////////////////////////////////////////////// -// Match + template + __global__ void match(const DevMem2D_ query, int imgIdx, const DevMem2D_ train, float maxDistance, const Mask mask, + PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) + { + #if __CUDA_ARCH__ >= 110 -template -__global__ void match(const DevMem2D_ query, int imgIdx, const DevMem2D_ train, float maxDistance, const Mask mask, - PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) -{ - #if __CUDA_ARCH__ >= 110 + extern __shared__ int smem[]; - extern __shared__ int smem[]; + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; - const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; - const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + Dist dist; - Dist dist; + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + { + const int loadX = 
threadIdx.x + i * BLOCK_SIZE; - for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + if (loadX < query.cols) + { + T val; - if (loadX < query.cols) - { - T val; + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + ForceGlob::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - ForceGlob::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + __syncthreads(); - __syncthreads(); + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + __syncthreads(); + } - __syncthreads(); - } + float distVal = (typename Dist::result_type)dist; - float distVal = (typename Dist::result_type)dist; + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance) + { + unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); + if (ind < maxCount) + { + bestTrainIdx.ptr(queryIdx)[ind] = trainIdx; + if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; + bestDistance.ptr(queryIdx)[ind] = distVal; + } + } - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance) - { - unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); - if (ind < maxCount) - { - bestTrainIdx.ptr(queryIdx)[ind] = trainIdx; - if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; - bestDistance.ptr(queryIdx)[ind] = distVal; + #endif } - } - #endif -} + template + void match(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); -template -void match(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + match<<>>(query, 0, train, maxDistance, mask, + trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); + cudaSafeCall( cudaGetLastError() ); - match<<>>(query, 0, train, maxDistance, mask, - trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); - cudaSafeCall( cudaGetLastError() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - if (stream == 0) - cudaSafeCall( 
cudaDeviceSynchronize() ); -} + template + void match(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + for (int i = 0; i < n; ++i) + { + const DevMem2D_ train = trains[i]; + + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + + if (masks != 0 && masks[i].data) + { + match<<>>(query, i, train, maxDistance, SingleMask(masks[i]), + trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + } + else + { + match<<>>(query, i, train, maxDistance, WithOutMask(), + trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + } + cudaSafeCall( cudaGetLastError() ); + } + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } -template -void match(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + /////////////////////////////////////////////////////////////////////////////// + // Match dispatcher - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + } + else if (query.cols <= 128) + { + matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + }*/ + else + { + match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + } + } - for (int i = 0; i < n; ++i) - { - const DevMem2D_ train = trains[i]; + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + else if (query.cols <= 128) + { + matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, 
masks, trainIdx, imgIdx, distance, nMatches, stream); + }*/ + else + { + match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + } + + /////////////////////////////////////////////////////////////////////////////// + // Radius Match caller + + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) + { + if (mask.data) + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, distance, nMatches, + cc, stream); + } + else + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, distance, nMatches, + cc, stream); + } + } - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - if (masks != 0 && masks[i].data) + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) { - match<<>>(query, i, train, maxDistance, SingleMask(masks[i]), - trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + if (mask.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, distance, nMatches, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, distance, nMatches, + cc, stream); + } } - else + + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const 
DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) { - match<<>>(query, i, train, maxDistance, WithOutMask(), - trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + if (mask.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, distance, nMatches, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, distance, nMatches, + cc, stream); + } } - cudaSafeCall( cudaGetLastError() ); - } - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} -/////////////////////////////////////////////////////////////////////////////// -// Match dispatcher + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template -void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) -{ - if (query.cols <= 64) - { - matchUnrolled<16, 64, Dist>(query, train, 
maxDistance, mask, trainIdx, distance, nMatches, stream); - } - else if (query.cols <= 128) - { - matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - }*/ - else - { - match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } -} - -template -void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) -{ - if (query.cols <= 64) - { - matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 128) - { - matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - }*/ - else - { - match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } -} + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, + trainIdx, imgIdx, distance, nMatches, + cc, stream); + } -/////////////////////////////////////////////////////////////////////////////// -// Radius Match caller + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, 
const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) -{ - if (mask.data) - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, - cc, stream); - } - else - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, - cc, stream); - } -} - -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - -template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) -{ - if (mask.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, - cc, stream); - } -} - -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const 
DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) -{ - if (mask.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, - cc, stream); - } -} - -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) -{ - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, - trainIdx, imgIdx, distance, nMatches, - cc, stream); -} - -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void 
matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - -template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) -{ - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, - trainIdx, imgIdx, distance, nMatches, - cc, stream); -} - -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int 
cc, cudaStream_t stream) -{ - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, - trainIdx, imgIdx, distance, nMatches, - cc, stream); -} + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) + { + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, + trainIdx, imgIdx, distance, nMatches, + cc, stream); + } -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -} // 
namespace bf_radius_match + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) + { + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, + trainIdx, imgIdx, distance, nMatches, + cc, stream); + } -END_OPENCV_DEVICE_NAMESPACE + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + } // namespace bf_radius_match +}}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/cuda/bilateral_filter.cu b/modules/gpu/src/cuda/bilateral_filter.cu index 4d3d9bc..0e2aa28 100644 --- a/modules/gpu/src/cuda/bilateral_filter.cu +++ b/modules/gpu/src/cuda/bilateral_filter.cu @@ -43,186 +43,184 @@ #include "internal_shared.hpp" #include "opencv2/gpu/device/limits.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE - -namespace bilateral_filter { - -__constant__ float* ctable_color; -__constant__ float* ctable_space; -__constant__ size_t ctable_space_step; - -__constant__ int cndisp; -__constant__ int cradius; - -__constant__ short cedge_disc; -__constant__ short cmax_disc; - -void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc) -{ - cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) ); - cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) ); - size_t table_space_step = table_space.step / sizeof(float); - cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) ); - - cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) ); - cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) ); - - cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) ); - cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) ); -} - -template -struct DistRgbMax -{ - static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) - { - uchar x = ::abs(a[0] - b[0]); - uchar y = ::abs(a[1] - b[1]); - uchar z = ::abs(a[2] - b[2]); - return (::max(::max(x, y), z)); - } -}; - 
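// [editor's note] DistRgbMax above computes a Chebyshev-style colour distance: the
// largest absolute per-channel difference between two pixels. The <1> specialization
// that follows drops the redundant max() chain for single-channel images. A minimal
// host-side sketch of the same idea (hypothetical helper, for illustration only):
//
//     #include <algorithm>
//     #include <cstdlib>
//
//     // Largest absolute per-channel difference between two CN-channel pixels.
//     template <int CN>
//     inline unsigned char dist_rgb_max(const unsigned char* a, const unsigned char* b)
//     {
//         int d = std::abs(a[0] - b[0]);
//         for (int c = 1; c < CN; ++c)
//             d = std::max(d, std::abs(a[c] - b[c]));
//         return static_cast<unsigned char>(d);
//     }
//
// Making the channel count a template parameter lets the compiler unroll the loop
// and, as in the kernel below, specialize away the extra work when CN == 1.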
-template <> -struct DistRgbMax<1> +namespace cv { namespace gpu { namespace device { - static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) + namespace bilateral_filter { - return ::abs(a[0] - b[0]); - } -}; + __constant__ float* ctable_color; + __constant__ float* ctable_space; + __constant__ size_t ctable_space_step; -template -__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) -{ - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); - - T dp[5]; + __constant__ int cndisp; + __constant__ int cradius; - if (y > 0 && y < h - 1 && x > 0 && x < w - 1) - { - dp[0] = *(disp + (y ) * disp_step + x + 0); - dp[1] = *(disp + (y-1) * disp_step + x + 0); - dp[2] = *(disp + (y ) * disp_step + x - 1); - dp[3] = *(disp + (y+1) * disp_step + x + 0); - dp[4] = *(disp + (y ) * disp_step + x + 1); + __constant__ short cedge_disc; + __constant__ short cmax_disc; - if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc) + void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc) { - const int ymin = ::max(0, y - cradius); - const int xmin = ::max(0, x - cradius); - const int ymax = ::min(h - 1, y + cradius); - const int xmax = ::min(w - 1, x + cradius); + cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) ); + cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) ); + size_t table_space_step = table_space.step / sizeof(float); + cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) ); - float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) ); - const uchar* ic = img + y * img_step + channels * x; + cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) ); + cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) ); + } - for(int yi = ymin; yi <= ymax; yi++) + template + struct DistRgbMax + { + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) { - const T* disp_y = disp + yi * disp_step; + uchar x = ::abs(a[0] - b[0]); + uchar y = ::abs(a[1] - b[1]); + uchar z = ::abs(a[2] - b[2]); + return (::max(::max(x, y), z)); + } + }; - for(int xi = xmin; xi <= xmax; xi++) - { - const uchar* in = img + yi * img_step + channels * xi; + template <> + struct DistRgbMax<1> + { + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) + { + return ::abs(a[0] - b[0]); + } + }; - uchar dist_rgb = DistRgbMax::calc(in, ic); + template + __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) + { + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); - const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)]; + T dp[5]; - const T disp_reg = disp_y[xi]; + if (y > 0 && y < h - 1 && x > 0 && x < w - 1) + { + dp[0] = *(disp + (y ) * disp_step + x + 0); + dp[1] = *(disp + (y-1) * disp_step + x + 0); + dp[2] = *(disp + (y ) * disp_step + x - 1); + dp[3] = *(disp + (y+1) * disp_step + x + 0); + dp[4] = *(disp + (y ) * 
disp_step + x + 1); - cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight; - cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight; - cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight; - cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight; - cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight; + if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc) + { + const int ymin = ::max(0, y - cradius); + const int xmin = ::max(0, x - cradius); + const int ymax = ::min(h - 1, y + cradius); + const int xmax = ::min(w - 1, x + cradius); + + float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + + const uchar* ic = img + y * img_step + channels * x; + + for(int yi = ymin; yi <= ymax; yi++) + { + const T* disp_y = disp + yi * disp_step; + + for(int xi = xmin; xi <= xmax; xi++) + { + const uchar* in = img + yi * img_step + channels * xi; + + uchar dist_rgb = DistRgbMax::calc(in, ic); + + const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)]; + + const T disp_reg = disp_y[xi]; + + cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight; + cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight; + cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight; + cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight; + cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight; + } + } + + float minimum = numeric_limits::max(); + int id = 0; + + if (cost[0] < minimum) + { + minimum = cost[0]; + id = 0; + } + if (cost[1] < minimum) + { + minimum = cost[1]; + id = 1; + } + if (cost[2] < minimum) + { + minimum = cost[2]; + id = 2; + } + if (cost[3] < minimum) + { + minimum = cost[3]; + id = 3; + } + if (cost[4] < minimum) + { + minimum = cost[4]; + id = 4; + } + + *(disp + y * disp_step + x) = dp[id]; } } + } - float minimum = numeric_limits::max(); - int id = 0; + template + void bilateral_filter_caller(DevMem2D_ disp, DevMem2Db img, int channels, int iters, cudaStream_t stream) + { + dim3 threads(32, 8, 1); + dim3 grid(1, 1, 1); + grid.x = divUp(disp.cols, threads.x << 1); + grid.y = divUp(disp.rows, threads.y); - if (cost[0] < minimum) + switch (channels) { - minimum = cost[0]; - id = 0; - } - if (cost[1] < minimum) - { - minimum = cost[1]; - id = 1; - } - if (cost[2] < minimum) - { - minimum = cost[2]; - id = 2; - } - if (cost[3] < minimum) - { - minimum = cost[3]; - id = 3; - } - if (cost[4] < minimum) - { - minimum = cost[4]; - id = 4; + case 1: + for (int i = 0; i < iters; ++i) + { + bilateral_filter<1><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + + bilateral_filter<1><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + } + break; + case 3: + for (int i = 0; i < iters; ++i) + { + bilateral_filter<3><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + + bilateral_filter<3><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + } + break; + default: + cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); } - *(disp + y * disp_step + x) = dp[id]; + if (stream != 0) + cudaSafeCall( cudaDeviceSynchronize() ); } - } -} -template -void bilateral_filter_caller(DevMem2D_ disp, DevMem2Db img, int channels, 
int iters, cudaStream_t stream) -{ - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - grid.x = divUp(disp.cols, threads.x << 1); - grid.y = divUp(disp.rows, threads.y); - - switch (channels) - { - case 1: - for (int i = 0; i < iters; ++i) + void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream) { - bilateral_filter<1><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); - cudaSafeCall( cudaGetLastError() ); - - bilateral_filter<1><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); - cudaSafeCall( cudaGetLastError() ); + bilateral_filter_caller(disp, img, channels, iters, stream); } - break; - case 3: - for (int i = 0; i < iters; ++i) - { - bilateral_filter<3><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); - cudaSafeCall( cudaGetLastError() ); - bilateral_filter<3><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); - cudaSafeCall( cudaGetLastError() ); + void bilateral_filter_gpu(DevMem2D_ disp, DevMem2Db img, int channels, int iters, cudaStream_t stream) + { + bilateral_filter_caller(disp, img, channels, iters, stream); } - break; - default: - cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); - } - - if (stream != 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} - -void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream) -{ - bilateral_filter_caller(disp, img, channels, iters, stream); -} - -void bilateral_filter_gpu(DevMem2D_ disp, DevMem2Db img, int channels, int iters, cudaStream_t stream) -{ - bilateral_filter_caller(disp, img, channels, iters, stream); -} - -} // namespace bilateral_filter - -END_OPENCV_DEVICE_NAMESPACE + } // namespace bilateral_filter +}}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/cuda/blend.cu b/modules/gpu/src/cuda/blend.cu index fca1b96..02e9649 100644 --- a/modules/gpu/src/cuda/blend.cu +++ b/modules/gpu/src/cuda/blend.cu @@ -42,77 +42,75 @@ #include "internal_shared.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE - -namespace blend { - -template -__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep img1, const PtrStep img2, - const PtrStepf weights1, const PtrStepf weights2, PtrStep result) +namespace cv { namespace gpu { namespace device { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (y < rows && x < cols) + namespace blend { - int x_ = x / cn; - float w1 = weights1.ptr(y)[x_]; - float w2 = weights2.ptr(y)[x_]; - T p1 = img1.ptr(y)[x]; - T p2 = img2.ptr(y)[x]; - result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f); - } -} - -template -void blendLinearCaller(int rows, int cols, int cn, PtrStep img1, PtrStep img2, PtrStepf weights1, PtrStepf weights2, PtrStep result, cudaStream_t stream) -{ - dim3 threads(16, 16); - dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y)); - - blendLinearKernel<<>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result); - cudaSafeCall( cudaGetLastError() ); + template + __global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep img1, const PtrStep img2, + const PtrStepf weights1, const PtrStepf weights2, PtrStep result) + { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; - if (stream == 0) - cudaSafeCall(cudaDeviceSynchronize()); -} + if (y < rows && x < cols) + { + int x_ = x / cn; + float w1 = 
weights1.ptr(y)[x_]; + float w2 = weights2.ptr(y)[x_]; + T p1 = img1.ptr(y)[x]; + T p2 = img2.ptr(y)[x]; + result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f); + } + } -template void blendLinearCaller(int, int, int, PtrStep, PtrStep, PtrStepf, PtrStepf, PtrStep, cudaStream_t stream); -template void blendLinearCaller(int, int, int, PtrStep, PtrStep, PtrStepf, PtrStepf, PtrStep, cudaStream_t stream); + template + void blendLinearCaller(int rows, int cols, int cn, PtrStep img1, PtrStep img2, PtrStepf weights1, PtrStepf weights2, PtrStep result, cudaStream_t stream) + { + dim3 threads(16, 16); + dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y)); + + blendLinearKernel<<>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result); + cudaSafeCall( cudaGetLastError() ); + if (stream == 0) + cudaSafeCall(cudaDeviceSynchronize()); + } -__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2, - const PtrStepf weights1, const PtrStepf weights2, PtrStepb result) -{ - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; + template void blendLinearCaller(int, int, int, PtrStep, PtrStep, PtrStepf, PtrStepf, PtrStep, cudaStream_t stream); + template void blendLinearCaller(int, int, int, PtrStep, PtrStep, PtrStepf, PtrStepf, PtrStep, cudaStream_t stream); - if (y < rows && x < cols) - { - float w1 = weights1.ptr(y)[x]; - float w2 = weights2.ptr(y)[x]; - float sum_inv = 1.f / (w1 + w2 + 1e-5f); - w1 *= sum_inv; - w2 *= sum_inv; - uchar4 p1 = ((const uchar4*)img1.ptr(y))[x]; - uchar4 p2 = ((const uchar4*)img2.ptr(y))[x]; - ((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2, - p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2); - } -} -void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream) -{ - dim3 threads(16, 16); - dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); - - blendLinearKernel8UC4<<>>(rows, cols, img1, img2, weights1, weights2, result); - cudaSafeCall( cudaGetLastError() ); + __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2, + const PtrStepf weights1, const PtrStepf weights2, PtrStepb result) + { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; - if (stream == 0) - cudaSafeCall(cudaDeviceSynchronize()); -} + if (y < rows && x < cols) + { + float w1 = weights1.ptr(y)[x]; + float w2 = weights2.ptr(y)[x]; + float sum_inv = 1.f / (w1 + w2 + 1e-5f); + w1 *= sum_inv; + w2 *= sum_inv; + uchar4 p1 = ((const uchar4*)img1.ptr(y))[x]; + uchar4 p2 = ((const uchar4*)img2.ptr(y))[x]; + ((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2, + p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2); + } + } -} // namespace blend + void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream) + { + dim3 threads(16, 16); + dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); + + blendLinearKernel8UC4<<>>(rows, cols, img1, img2, weights1, weights2, result); + cudaSafeCall( cudaGetLastError() ); -END_OPENCV_DEVICE_NAMESPACE + if (stream == 0) + cudaSafeCall(cudaDeviceSynchronize()); + } + } // namespace blend +}}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu index 
1cdf191..27c2afb 100644 --- a/modules/gpu/src/cuda/calib3d.cu +++ b/modules/gpu/src/cuda/calib3d.cu @@ -44,149 +44,148 @@ #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/functional.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE - -#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200 - -namespace transform_points +namespace cv { namespace gpu { namespace device { - __constant__ float3 crot0; - __constant__ float3 crot1; - __constant__ float3 crot2; - __constant__ float3 ctransl; + #define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200 - struct TransformOp : unary_function + namespace transform_points { - __device__ __forceinline__ float3 operator()(const float3& p) const + __constant__ float3 crot0; + __constant__ float3 crot1; + __constant__ float3 crot2; + __constant__ float3 ctransl; + + struct TransformOp : unary_function { - return make_float3( - crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x, - crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, - crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); + __device__ __forceinline__ float3 operator()(const float3& p) const + { + return make_float3( + crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x, + crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, + crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); + } + }; + + void call(const DevMem2D_ src, const float* rot, + const float* transl, DevMem2D_ dst, + cudaStream_t stream) + { + cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3)); + cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3)); + cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3)); + cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3)); + ::cv::gpu::device::transform(src, dst, TransformOp(), stream); } - }; + } // namespace transform_points - void call(const DevMem2D_ src, const float* rot, - const float* transl, DevMem2D_ dst, - cudaStream_t stream) - { - cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3)); - cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3)); - cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3)); - cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3)); - OPENCV_DEVICE_NAMESPACE_ transform(src, dst, TransformOp(), stream); - } -} // namespace transform_points - -namespace project_points -{ - __constant__ float3 crot0; - __constant__ float3 crot1; - __constant__ float3 crot2; - __constant__ float3 ctransl; - __constant__ float3 cproj0; - __constant__ float3 cproj1; - - struct ProjectOp : unary_function + namespace project_points { - __device__ __forceinline__ float2 operator()(const float3& p) const + __constant__ float3 crot0; + __constant__ float3 crot1; + __constant__ float3 crot2; + __constant__ float3 ctransl; + __constant__ float3 cproj0; + __constant__ float3 cproj1; + + struct ProjectOp : unary_function { - // Rotate and translate in 3D - float3 t = make_float3( - crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x, - crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, - crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); - // Project on 2D plane - return make_float2( - (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z, - (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z); + __device__ __forceinline__ float2 operator()(const float3& p) const + { + // Rotate and translate in 3D + float3 t = make_float3( + crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x, + crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, + 
-
-namespace project_points
-{
-    __constant__ float3 crot0;
-    __constant__ float3 crot1;
-    __constant__ float3 crot2;
-    __constant__ float3 ctransl;
-    __constant__ float3 cproj0;
-    __constant__ float3 cproj1;
-
-    struct ProjectOp : unary_function<float3, float2>
-    {
-        __device__ __forceinline__ float2 operator()(const float3& p) const
-        {
-            // Rotate and translate in 3D
-            float3 t = make_float3(
-                crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
-                crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
-                crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
-            // Project on 2D plane
-            return make_float2(
-                (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
-                (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
-        }
-    };
-
-    void call(const DevMem2D_<float3> src, const float* rot,
-              const float* transl, const float* proj, DevMem2D_<float2> dst,
-              cudaStream_t stream)
-    {
-        cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
-        OPENCV_DEVICE_NAMESPACE_ transform(src, dst, ProjectOp(), stream);
-    }
-} // namespace project_points
+
+    namespace project_points
+    {
+        __constant__ float3 crot0;
+        __constant__ float3 crot1;
+        __constant__ float3 crot2;
+        __constant__ float3 ctransl;
+        __constant__ float3 cproj0;
+        __constant__ float3 cproj1;
+
+        struct ProjectOp : unary_function<float3, float2>
+        {
+            __device__ __forceinline__ float2 operator()(const float3& p) const
+            {
+                // Rotate and translate in 3D
+                float3 t = make_float3(
+                    crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
+                    crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
+                    crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
+                // Project on 2D plane
+                return make_float2(
+                    (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
+                    (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
+            }
+        };
+
+        void call(const DevMem2D_<float3> src, const float* rot,
+                  const float* transl, const float* proj, DevMem2D_<float2> dst,
+                  cudaStream_t stream)
+        {
+            cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
+            ::cv::gpu::device::transform(src, dst, ProjectOp(), stream);
+        }
+    } // namespace project_points
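For reference, ProjectOp is the standard pinhole projection with the two intrinsic rows packed as float3: taking cproj0 = (fx, 0, cx) and cproj1 = (0, fy, cy), the returned pair reduces to u = fx*X/Z + cx and v = fy*Y/Z + cy for a camera-space point t = (X, Y, Z). A tiny CPU check of that identity — the numbers are made up for illustration:

#include <cstdio>

int main()
{
    const float fx = 500.f, fy = 500.f, cx = 320.f, cy = 240.f;
    const float X = 1.f, Y = 2.f, Z = 4.f;        // camera-space point
    const float u = (fx * X + 0.f * Y) / Z + cx;  // cproj0 = (fx, 0, cx)
    const float v = (0.f * X + fy * Y) / Z + cy;  // cproj1 = (0, fy, cy)
    std::printf("u=%.1f v=%.1f\n", u, v);         // prints u=445.0 v=490.0
    return 0;
}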
-
-namespace solve_pnp_ransac
-{
-    __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
-    __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
-
-    int maxNumIters()
-    {
-        return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
-    }
-
-    __device__ __forceinline__ float sqr(float x)
-    {
-        return x * x;
-    }
-
-    __global__ void computeHypothesisScoresKernel(
-        const int num_points, const float3* object, const float2* image,
-        const float dist_threshold, int* g_num_inliers)
-    {
-        const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
-        const float3 &transl_vec = ctransl_vectors[blockIdx.x];
-        int num_inliers = 0;
-
-        for (int i = threadIdx.x; i < num_points; i += blockDim.x)
-        {
-            float3 p = object[i];
-            p = make_float3(
-                rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
-                rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
-                rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
-            p.x /= p.z;
-            p.y /= p.z;
-            float2 image_p = image[i];
-            if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
-                ++num_inliers;
-        }
-
-        extern __shared__ float s_num_inliers[];
-        s_num_inliers[threadIdx.x] = num_inliers;
-        __syncthreads();
-
-        for (int step = blockDim.x / 2; step > 0; step >>= 1)
-        {
-            if (threadIdx.x < step)
-                s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
-            __syncthreads();
-        }
-
-        if (threadIdx.x == 0)
-            g_num_inliers[blockIdx.x] = s_num_inliers[0];
-    }
-
-    void computeHypothesisScores(
-        const int num_hypotheses, const int num_points, const float* rot_matrices,
-        const float3* transl_vectors, const float3* object, const float2* image,
-        const float dist_threshold, int* hypothesis_scores)
-    {
-        cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
-        cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
-
-        dim3 threads(256);
-        dim3 grid(num_hypotheses);
-        int smem_size = threads.x * sizeof(float);
-
-        computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
-            num_points, object, image, dist_threshold, hypothesis_scores);
-        cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-} // namespace solvepnp_ransac
-
-END_OPENCV_DEVICE_NAMESPACE
+
+    namespace solve_pnp_ransac
+    {
+        __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
+        __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
+
+        int maxNumIters()
+        {
+            return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
+        }
+
+        __device__ __forceinline__ float sqr(float x)
+        {
+            return x * x;
+        }
+
+        __global__ void computeHypothesisScoresKernel(
+            const int num_points, const float3* object, const float2* image,
+            const float dist_threshold, int* g_num_inliers)
+        {
+            const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
+            const float3 &transl_vec = ctransl_vectors[blockIdx.x];
+            int num_inliers = 0;
+
+            for (int i = threadIdx.x; i < num_points; i += blockDim.x)
+            {
+                float3 p = object[i];
+                p = make_float3(
+                    rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
+                    rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
+                    rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
+                p.x /= p.z;
+                p.y /= p.z;
+                float2 image_p = image[i];
+                if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
+                    ++num_inliers;
+            }
+
+            extern __shared__ float s_num_inliers[];
+            s_num_inliers[threadIdx.x] = num_inliers;
+            __syncthreads();
+
+            for (int step = blockDim.x / 2; step > 0; step >>= 1)
+            {
+                if (threadIdx.x < step)
+                    s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
+                __syncthreads();
+            }
+
+            if (threadIdx.x == 0)
+                g_num_inliers[blockIdx.x] = s_num_inliers[0];
+        }
+
+        void computeHypothesisScores(
+            const int num_hypotheses, const int num_points, const float* rot_matrices,
+            const float3* transl_vectors, const float3* object, const float2* image,
+            const float dist_threshold, int* hypothesis_scores)
+        {
+            cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
+            cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
+
+            dim3 threads(256);
+            dim3 grid(num_hypotheses);
+            int smem_size = threads.x * sizeof(float);
+
+            computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
+                num_points, object, image, dist_threshold, hypothesis_scores);
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    } // namespace solvepnp_ransac
+}}} // namespace cv { namespace gpu { namespace device
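The solve_pnp_ransac code above scores one pose hypothesis per block: each thread accumulates a private inlier count over a thread-strided range of points, and the per-thread counts are folded with a power-of-two tree reduction in shared memory, leaving the block total in element 0. A stripped-down sketch of just that reduction, assuming a fixed block size of 256; the names are hypothetical and the sketch is for illustration only:

#include <cuda_runtime.h>

// One block per hypothesis; flags[i] is 1 if point i is an inlier.
// Launch as: countReduce<<<num_hypotheses, 256>>>(flags, n, block_sums);
__global__ void countReduce(const int* flags, int n, int* block_sums)
{
    __shared__ int s[256];          // one slot per thread

    int count = 0;
    for (int i = threadIdx.x; i < n; i += blockDim.x) // thread-strided loop
        count += flags[i];

    s[threadIdx.x] = count;
    __syncthreads();

    // Tree reduction: halve the active range each step.
    for (int step = blockDim.x / 2; step > 0; step >>= 1)
    {
        if (threadIdx.x < step)
            s[threadIdx.x] += s[threadIdx.x + step];
        __syncthreads();
    }

    if (threadIdx.x == 0)
        block_sums[blockIdx.x] = s[0]; // one score per block
}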
diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu
index 5f31fa7..bf31eee 100644
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -44,450 +44,448 @@
 #include
 #include "internal_shared.hpp"
 
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace canny {
-
-__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
-{
-    __shared__ int smem[16][18];
-
-    const int j = blockIdx.x * blockDim.x + threadIdx.x;
-    const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i < rows)
-    {
-        smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
-        if (threadIdx.x == 0)
-        {
-            smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
-            smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
-        }
-        __syncthreads();
-
-        if (j < cols)
-        {
-            dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
-            dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
-        }
-    }
-}
-
-void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
-{
-    dim3 block(16, 16, 1);
-    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-    calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
-    cudaSafeCall( cudaGetLastError() );
-
-    cudaSafeCall(cudaThreadSynchronize());
-}
+namespace cv { namespace gpu { namespace device
+{
+    namespace canny
+    {
+        __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+        {
+            __shared__ int smem[16][18];
+
+            const int j = blockIdx.x * blockDim.x + threadIdx.x;
+            const int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (i < rows)
+            {
+                smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
+                if (threadIdx.x == 0)
+                {
+                    smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
+                    smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
+                }
+                __syncthreads();
+
+                if (j < cols)
+                {
+                    dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
+                    dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
+                }
+            }
+        }
+
+        void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+        {
+            dim3 block(16, 16, 1);
+            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+            calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall(cudaThreadSynchronize());
+        }
+
-struct L1
-{
-    static __device__ __forceinline__ float calc(int x, int y)
-    {
-        return ::abs(x) + ::abs(y);
-    }
-};
-struct L2
-{
-    static __device__ __forceinline__ float calc(int x, int y)
-    {
-        return ::sqrtf(x * x + y * y);
-    }
-};
+        struct L1
+        {
+            static __device__ __forceinline__ float calc(int x, int y)
+            {
+                return ::abs(x) + ::abs(y);
+            }
+        };
+        struct L2
+        {
+            static __device__ __forceinline__ float calc(int x, int y)
+            {
+                return ::sqrtf(x * x + y * y);
+            }
+        };
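The row pass above and the column half of calcMagnitude below implement the separable 3x3 Sobel factorization: Gx = [1 2 1]^T x [-1 0 1] and Gy = [-1 0 1]^T x [1 2 1], so the row kernel emits the horizontal factors and the column step applies the complementary vertical ones before L1 or L2 selects the gradient norm. A small CPU check of the equivalence — the sample values are arbitrary:

#include <cstdio>

int main()
{
    int p[3][3] = { {1, 2, 3}, {4, 5, 6}, {7, 8, 9} };

    // Row pass (as in calcSobelRowPass), one result per row:
    int dxr[3], dyr[3];
    for (int y = 0; y < 3; ++y)
    {
        dxr[y] = -p[y][0] + p[y][2];               // (-1 0 1) horizontally
        dyr[y] =  p[y][0] + 2 * p[y][1] + p[y][2]; // (1 2 1) horizontally
    }

    // Column pass (as in the first calcMagnitude):
    int dx = dxr[0] + 2 * dxr[1] + dxr[2];         // (1 2 1) vertically
    int dy = -dyr[0] + dyr[2];                     // (-1 0 1) vertically

    // Direct 3x3 Sobel at the center pixel, for comparison:
    int gx = -p[0][0] + p[0][2] - 2 * p[1][0] + 2 * p[1][2] - p[2][0] + p[2][2];
    int gy = -p[0][0] - 2 * p[0][1] - p[0][2] + p[2][0] + 2 * p[2][1] + p[2][2];

    std::printf("separable: %d %d  direct: %d %d\n", dx, dy, gx, gy); // 8 24  8 24
    return 0;
}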
-template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
-    PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
-{
-    __shared__ int sdx[18][16];
-    __shared__ int sdy[18][16];
-
-    const int j = blockIdx.x * blockDim.x + threadIdx.x;
-    const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (j < cols)
-    {
-        sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
-        sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
-        if (threadIdx.y == 0)
-        {
-            sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
-            sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
-
-            sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
-            sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
-        }
-        __syncthreads();
-
-        if (i < rows)
-        {
-            int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
-            int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
-
-            dx.ptr(i)[j] = x;
-            dy.ptr(i)[j] = y;
-
-            mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
-        }
-    }
-}
+        template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
+            PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+        {
+            __shared__ int sdx[18][16];
+            __shared__ int sdy[18][16];
+
+            const int j = blockIdx.x * blockDim.x + threadIdx.x;
+            const int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (j < cols)
+            {
+                sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
+                sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
+                if (threadIdx.y == 0)
+                {
+                    sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
+                    sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
+
+                    sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
+                    sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
+                }
+                __syncthreads();
+
+                if (i < rows)
+                {
+                    int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
+                    int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
+
+                    dx.ptr(i)[j] = x;
+                    dy.ptr(i)[j] = y;
+
+                    mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
+                }
+            }
+        }
+
-void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
-{
-    dim3 block(16, 16, 1);
-    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-    if (L2Grad)
-        calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
-    else
-        calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
-
-    cudaSafeCall( cudaGetLastError() );
-
-    cudaSafeCall(cudaThreadSynchronize());
-}
+        void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
+        {
+            dim3 block(16, 16, 1);
+            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+            if (L2Grad)
+                calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
+            else
+                calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
+
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall(cudaThreadSynchronize());
+        }
+
-template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
-{
-    const int j = blockIdx.x * blockDim.x + threadIdx.x;
-    const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i < rows && j < cols)
-        mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
-}
+        template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+        {
+            const int j = blockIdx.x * blockDim.x + threadIdx.x;
+            const int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (i < rows && j < cols)
+                mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
+        }
+
-void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
-{
-    dim3 block(16, 16, 1);
-    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-    if (L2Grad)
-        calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
-    else
-        calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
-
-    cudaSafeCall( cudaGetLastError() );
-
-    cudaSafeCall(cudaThreadSynchronize());
-}
+        void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
+        {
+            dim3 block(16, 16, 1);
+            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+            if (L2Grad)
+                calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
+            else
+                calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
+
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall(cudaThreadSynchronize());
+        }
+
-//////////////////////////////////////////////////////////////////////////////////////////
-
-#define CANNY_SHIFT 15
-#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
-
-__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
-{
-    __shared__ float smem[18][18];
-
-    const int j = blockIdx.x * 16 + threadIdx.x;
-    const int i = blockIdx.y * 16 + threadIdx.y;
-
-    const int tid = threadIdx.y * 16 + threadIdx.x;
-    const int lx = tid % 18;
-    const int ly = tid / 18;
-
-    if (ly < 14)
-        smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
-
-    if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
-        smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
-
-    __syncthreads();
-
+        //////////////////////////////////////////////////////////////////////////////////////////
+
+        #define CANNY_SHIFT 15
+        #define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
+
+        __global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
+        {
+            __shared__ float smem[18][18];
+
+            const int j = blockIdx.x * 16 + threadIdx.x;
+            const int i = blockIdx.y * 16 + threadIdx.y;
+
+            const int tid = threadIdx.y * 16 + threadIdx.x;
+            const int lx = tid % 18;
+            const int ly = tid / 18;
+
+            if (ly < 14)
+                smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
+
+            if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
+                smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
+
+            __syncthreads();
+
+            if (i < rows && j < cols)
+            {
+                int x = dx.ptr(i)[j];
+                int y = dy.ptr(i)[j];
+                const int s = (x ^ y) < 0 ? -1 : 1;
+                const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
+
+                x = ::abs(x);
+                y = ::abs(y);
+
+                // 0 - the pixel can not belong to an edge
+                // 1 - the pixel might belong to an edge
+                // 2 - the pixel does belong to an edge
+                int edge_type = 0;
+
+                if (m > low_thresh)
+                {
+                    const int tg22x = x * TG22;
+ const int tg67x = tg22x + ((x + x) << CANNY_SHIFT); - if (i < rows && j < cols) - { - int x = dx.ptr(i)[j]; - int y = dy.ptr(i)[j]; - const int s = (x ^ y) < 0 ? -1 : 1; - const float m = smem[threadIdx.y + 1][threadIdx.x + 1]; + y <<= CANNY_SHIFT; - x = ::abs(x); - y = ::abs(y); + if (y < tg22x) + { + if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2]) + edge_type = 1 + (int)(m > high_thresh); + } + else if( y > tg67x ) + { + if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1]) + edge_type = 1 + (int)(m > high_thresh); + } + else + { + if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s]) + edge_type = 1 + (int)(m > high_thresh); + } + } + + map.ptr(i + 1)[j + 1] = edge_type; + } + } - // 0 - the pixel can not belong to an edge - // 1 - the pixel might belong to an edge - // 2 - the pixel does belong to an edge - int edge_type = 0; + #undef CANNY_SHIFT + #undef TG22 - if (m > low_thresh) + void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) { - const int tg22x = x * TG22; - const int tg67x = tg22x + ((x + x) << CANNY_SHIFT); + dim3 block(16, 16, 1); + dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - y <<= CANNY_SHIFT; + calcMap<<>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh); + cudaSafeCall( cudaGetLastError() ); - if (y < tg22x) - { - if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2]) - edge_type = 1 + (int)(m > high_thresh); - } - else if( y > tg67x ) - { - if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1]) - edge_type = 1 + (int)(m > high_thresh); - } - else - { - if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s]) - edge_type = 1 + (int)(m > high_thresh); - } + cudaSafeCall(cudaThreadSynchronize()); } - - map.ptr(i + 1)[j + 1] = edge_type; - } -} -#undef CANNY_SHIFT -#undef TG22 - -void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) -{ - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + ////////////////////////////////////////////////////////////////////////////////////////// - calcMap<<>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh); - cudaSafeCall( cudaGetLastError() ); + __device__ unsigned int counter = 0; - cudaSafeCall(cudaThreadSynchronize()); -} - -////////////////////////////////////////////////////////////////////////////////////////// + __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols) + { + #if __CUDA_ARCH__ >= 120 -__device__ unsigned int counter = 0; + __shared__ int smem[18][18]; -__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols) -{ - #if __CUDA_ARCH__ >= 120 + const int j = blockIdx.x * 16 + threadIdx.x; + const int i = blockIdx.y * 16 + threadIdx.y; - __shared__ int smem[18][18]; + const int tid = threadIdx.y * 16 + threadIdx.x; + const int lx = tid % 18; + const int ly = tid / 18; - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; + if (ly < 14) + smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; - const int tid = threadIdx.y * 16 + threadIdx.x; - const int lx = tid % 18; - const int ly = tid / 18; + if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) + 
smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; - if (ly < 14) - smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; + __syncthreads(); - if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) - smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; + if (i < rows && j < cols) + { + int n; - __syncthreads(); + #pragma unroll + for (int k = 0; k < 16; ++k) + { + n = 0; - if (i < rows && j < cols) - { - int n; + if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) + { + n += smem[threadIdx.y ][threadIdx.x ] == 2; + n += smem[threadIdx.y ][threadIdx.x + 1] == 2; + n += smem[threadIdx.y ][threadIdx.x + 2] == 2; + + n += smem[threadIdx.y + 1][threadIdx.x ] == 2; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; + + n += smem[threadIdx.y + 2][threadIdx.x ] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; + } - #pragma unroll - for (int k = 0; k < 16; ++k) - { - n = 0; + if (n > 0) + smem[threadIdx.y + 1][threadIdx.x + 1] = 2; + } - if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) - { - n += smem[threadIdx.y ][threadIdx.x ] == 2; - n += smem[threadIdx.y ][threadIdx.x + 1] == 2; - n += smem[threadIdx.y ][threadIdx.x + 2] == 2; - - n += smem[threadIdx.y + 1][threadIdx.x ] == 2; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; - - n += smem[threadIdx.y + 2][threadIdx.x ] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; - } + const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; - if (n > 0) - smem[threadIdx.y + 1][threadIdx.x + 1] = 2; - } + map.ptr(i + 1)[j + 1] = e; - const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; + n = 0; - map.ptr(i + 1)[j + 1] = e; + if (e == 2) + { + n += smem[threadIdx.y ][threadIdx.x ] == 1; + n += smem[threadIdx.y ][threadIdx.x + 1] == 1; + n += smem[threadIdx.y ][threadIdx.x + 2] == 1; + + n += smem[threadIdx.y + 1][threadIdx.x ] == 1; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; + + n += smem[threadIdx.y + 2][threadIdx.x ] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; + } - n = 0; + if (n > 0) + { + const unsigned int ind = atomicInc(&counter, (unsigned int)(-1)); + st[ind] = make_ushort2(j + 1, i + 1); + } + } - if (e == 2) - { - n += smem[threadIdx.y ][threadIdx.x ] == 1; - n += smem[threadIdx.y ][threadIdx.x + 1] == 1; - n += smem[threadIdx.y ][threadIdx.x + 2] == 1; - - n += smem[threadIdx.y + 1][threadIdx.x ] == 1; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; - - n += smem[threadIdx.y + 2][threadIdx.x ] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; + #endif } - if (n > 0) + void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols) { - const unsigned int ind = atomicInc(&counter, (unsigned int)(-1)); - st[ind] = make_ushort2(j + 1, i + 1); - } - } - - #endif -} - -void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols) -{ - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - - edgesHysteresisLocal<<>>(map, st1, rows, cols); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall(cudaThreadSynchronize()); -} - -__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; -__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; - -__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count) -{ 
- #if __CUDA_ARCH__ >= 120 + dim3 block(16, 16, 1); + dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - const int stack_size = 512; - - __shared__ unsigned int s_counter; - __shared__ unsigned int s_ind; - __shared__ ushort2 s_st[stack_size]; + edgesHysteresisLocal<<>>(map, st1, rows, cols); + cudaSafeCall( cudaGetLastError() ); - if (threadIdx.x == 0) - s_counter = 0; - __syncthreads(); + cudaSafeCall(cudaThreadSynchronize()); + } - int ind = blockIdx.y * gridDim.x + blockIdx.x; + __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; + __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; - if (ind < count) - { - ushort2 pos = st1[ind]; - - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) + __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count) { - if (threadIdx.x < 8) - { - pos.x += c_dx[threadIdx.x]; - pos.y += c_dy[threadIdx.x]; + #if __CUDA_ARCH__ >= 120 - if (map.ptr(pos.y)[pos.x] == 1) - { - map.ptr(pos.y)[pos.x] = 2; - - ind = atomicInc(&s_counter, (unsigned int)(-1)); + const int stack_size = 512; + + __shared__ unsigned int s_counter; + __shared__ unsigned int s_ind; + __shared__ ushort2 s_st[stack_size]; - s_st[ind] = pos; - } - } + if (threadIdx.x == 0) + s_counter = 0; __syncthreads(); - while (s_counter > 0 && s_counter <= stack_size - blockDim.x) - { - const int subTaskIdx = threadIdx.x >> 3; - const int portion = ::min(s_counter, blockDim.x >> 3); + int ind = blockIdx.y * gridDim.x + blockIdx.x; - pos.x = pos.y = 0; + if (ind < count) + { + ushort2 pos = st1[ind]; - if (subTaskIdx < portion) - pos = s_st[s_counter - 1 - subTaskIdx]; - __syncthreads(); - - if (threadIdx.x == 0) - s_counter -= portion; - __syncthreads(); - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) { - pos.x += c_dx[threadIdx.x & 7]; - pos.y += c_dy[threadIdx.x & 7]; - - if (map.ptr(pos.y)[pos.x] == 1) + if (threadIdx.x < 8) { - map.ptr(pos.y)[pos.x] = 2; + pos.x += c_dx[threadIdx.x]; + pos.y += c_dy[threadIdx.x]; - ind = atomicInc(&s_counter, (unsigned int)(-1)); + if (map.ptr(pos.y)[pos.x] == 1) + { + map.ptr(pos.y)[pos.x] = 2; - s_st[ind] = pos; - } - } - __syncthreads(); - } + ind = atomicInc(&s_counter, (unsigned int)(-1)); - if (s_counter > 0) - { - if (threadIdx.x == 0) - { - ind = atomicAdd(&counter, s_counter); - s_ind = ind - s_counter; - } - __syncthreads(); + s_st[ind] = pos; + } + } + __syncthreads(); - ind = s_ind; + while (s_counter > 0 && s_counter <= stack_size - blockDim.x) + { + const int subTaskIdx = threadIdx.x >> 3; + const int portion = ::min(s_counter, blockDim.x >> 3); + + pos.x = pos.y = 0; + + if (subTaskIdx < portion) + pos = s_st[s_counter - 1 - subTaskIdx]; + __syncthreads(); + + if (threadIdx.x == 0) + s_counter -= portion; + __syncthreads(); + + if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) + { + pos.x += c_dx[threadIdx.x & 7]; + pos.y += c_dy[threadIdx.x & 7]; + + if (map.ptr(pos.y)[pos.x] == 1) + { + map.ptr(pos.y)[pos.x] = 2; + + ind = atomicInc(&s_counter, (unsigned int)(-1)); + + s_st[ind] = pos; + } + } + __syncthreads(); + } - for (int i = threadIdx.x; i < s_counter; i += blockDim.x) - { - st2[ind + i] = s_st[i]; + if (s_counter > 0) + { + if (threadIdx.x == 0) + { + ind = atomicAdd(&counter, s_counter); + s_ind = ind - s_counter; + } + __syncthreads(); + + ind = s_ind; + + for (int i = threadIdx.x; i < s_counter; i += blockDim.x) + { + st2[ind + i] = s_st[i]; + } + } } } - } - } - - #endif -} - -void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, 
ushort2* st2, int rows, int cols) -{ - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - - unsigned int count; - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - while (count > 0) - { - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); + #endif + } - dim3 block(128, 1, 1); - dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1); - edgesHysteresisGlobal<<>>(map, st1, st2, rows, cols, count); - cudaSafeCall( cudaGetLastError() ); + void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols) + { + void* counter_ptr; + cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); + + unsigned int count; + cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - cudaSafeCall(cudaThreadSynchronize()); + while (count > 0) + { + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + dim3 block(128, 1, 1); + dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1); + edgesHysteresisGlobal<<>>(map, st1, st2, rows, cols, count); + cudaSafeCall( cudaGetLastError() ); - std::swap(st1, st2); - } -} + cudaSafeCall(cudaThreadSynchronize()); -__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols) -{ - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; + cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - if (i < rows && j < cols) - dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1)); -} + std::swap(st1, st2); + } + } -void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols) -{ - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols) + { + const int j = blockIdx.x * 16 + threadIdx.x; + const int i = blockIdx.y * 16 + threadIdx.y; - getEdges<<>>(map, dst, rows, cols); - cudaSafeCall( cudaGetLastError() ); + if (i < rows && j < cols) + dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1)); + } - cudaSafeCall(cudaThreadSynchronize()); -} + void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols) + { + dim3 block(16, 16, 1); + dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); -} // namespace canny + getEdges<<>>(map, dst, rows, cols); + cudaSafeCall( cudaGetLastError() ); -END_OPENCV_DEVICE_NAMESPACE + cudaSafeCall(cudaThreadSynchronize()); + } + } // namespace canny +}}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu index 4da3f77..9384ea6 100644 --- a/modules/gpu/src/cuda/color.cu +++ b/modules/gpu/src/cuda/color.cu @@ -44,181 +44,181 @@ #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/color.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { 
smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type) -{ - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type) -{ - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 
4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; +namespace cv { namespace gpu { namespace device +{ + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits::functor_type) + { + enum { smart_block_dim_x = 8 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type) + { + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type) + { + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + 
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; #define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \ void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \ @@ -226,7 +226,7 @@ DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits::functor_type) traits::functor_type functor = traits::create_functor(); \ typedef typename traits::functor_type::argument_type src_t; \ typedef typename traits::functor_type::result_type dst_t; \ - OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_)src, (DevMem2D_)dst, functor, stream); \ + ::cv::gpu::device::transform((DevMem2D_)src, (DevMem2D_)dst, functor, stream); \ } #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \ @@ -243,138 +243,137 @@ DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits::functor_type) OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits) \ OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555) 
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls) 
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra) - -#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR -#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE -#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL -#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F - -END_OPENCV_DEVICE_NAMESPACE + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4) + 
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra) + + #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR + #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE + #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL + #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F +}}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu index c16ca82..df85641 100644 --- a/modules/gpu/src/cuda/column_filter.cu +++ b/modules/gpu/src/cuda/column_filter.cu @@ -47,203 +47,201 @@ 
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/border_interpolate.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE - -#define MAX_KERNEL_SIZE 16 -#define BLOCK_DIM_X 16 -#define BLOCK_DIM_Y 4 -#define RESULT_STEPS 8 -#define HALO_STEPS 1 - -namespace column_filter { +namespace cv { namespace gpu { namespace device +{ + #define MAX_KERNEL_SIZE 16 + #define BLOCK_DIM_X 16 + #define BLOCK_DIM_Y 4 + #define RESULT_STEPS 8 + #define HALO_STEPS 1 -__constant__ float c_kernel[MAX_KERNEL_SIZE]; + namespace column_filter + { + __constant__ float c_kernel[MAX_KERNEL_SIZE]; -void loadKernel(const float kernel[], int ksize) -{ - cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); -} + void loadKernel(const float kernel[], int ksize) + { + cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); + } -template -__global__ void linearColumnFilter(const DevMem2D_ src, PtrStep dst, int anchor, const B b) -{ - typedef typename TypeVec::cn>::vec_type sum_t; + template + __global__ void linearColumnFilter(const DevMem2D_ src, PtrStep dst, int anchor, const B b) + { + typedef typename TypeVec::cn>::vec_type sum_t; - __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1]; + __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1]; - //Offset to the upper halo edge - const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; - const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y; + //Offset to the upper halo edge + const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; + const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y; - if (x < src.cols) - { - const T* src_col = src.ptr() + x; + if (x < src.cols) + { + const T* src_col = src.ptr() + x; - //Main data - #pragma unroll - for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) - smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); + //Main data + #pragma unroll + for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); - //Upper halo - #pragma unroll - for(int i = 0; i < HALO_STEPS; ++i) - smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step); + //Upper halo + #pragma unroll + for(int i = 0; i < HALO_STEPS; ++i) + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step); - //Lower halo - #pragma unroll - for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i) - smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); + //Lower halo + #pragma unroll + for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i) + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); - __syncthreads(); + __syncthreads(); - #pragma unroll - for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) - { - sum_t sum = VecTraits::all(0); + #pragma unroll + for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) + { + sum_t sum = VecTraits::all(0); - #pragma unroll - for(int j = 0; j < KERNEL_SIZE; ++j) - sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j]; + #pragma unroll + for(int j = 0; j < KERNEL_SIZE; ++j) + sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * 
c_kernel[j]; - int dstY = y + i * BLOCK_DIM_Y; + int dstY = y + i * BLOCK_DIM_Y; - if (dstY < src.rows) - dst.ptr(dstY)[x] = saturate_cast(sum); + if (dstY < src.rows) + dst.ptr(dstY)[x] = saturate_cast(sum); + } + } } - } -} -template class B> -void linearColumnFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream) -{ - const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); - const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y)); + template class B> + void linearColumnFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream) + { + const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); + const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y)); - B b(src.rows); + B b(src.rows); - linearColumnFilter<<>>(src, dst, anchor, b); - cudaSafeCall( cudaGetLastError() ); + linearColumnFilter<<>>(src, dst, anchor, b); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } -template -void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) -{ - typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream); - static const caller_t callers[5][17] = - { - { - 0, - linearColumnFilter_caller<1 , T, D, BrdColReflect101>, - linearColumnFilter_caller<2 , T, D, BrdColReflect101>, - linearColumnFilter_caller<3 , T, D, BrdColReflect101>, - linearColumnFilter_caller<4 , T, D, BrdColReflect101>, - linearColumnFilter_caller<5 , T, D, BrdColReflect101>, - linearColumnFilter_caller<6 , T, D, BrdColReflect101>, - linearColumnFilter_caller<7 , T, D, BrdColReflect101>, - linearColumnFilter_caller<8 , T, D, BrdColReflect101>, - linearColumnFilter_caller<9 , T, D, BrdColReflect101>, - linearColumnFilter_caller<10, T, D, BrdColReflect101>, - linearColumnFilter_caller<11, T, D, BrdColReflect101>, - linearColumnFilter_caller<12, T, D, BrdColReflect101>, - linearColumnFilter_caller<13, T, D, BrdColReflect101>, - linearColumnFilter_caller<14, T, D, BrdColReflect101>, - linearColumnFilter_caller<15, T, D, BrdColReflect101>, - linearColumnFilter_caller<16, T, D, BrdColReflect101> - }, - { - 0, - linearColumnFilter_caller<1 , T, D, BrdColReplicate>, - linearColumnFilter_caller<2 , T, D, BrdColReplicate>, - linearColumnFilter_caller<3 , T, D, BrdColReplicate>, - linearColumnFilter_caller<4 , T, D, BrdColReplicate>, - linearColumnFilter_caller<5 , T, D, BrdColReplicate>, - linearColumnFilter_caller<6 , T, D, BrdColReplicate>, - linearColumnFilter_caller<7 , T, D, BrdColReplicate>, - linearColumnFilter_caller<8 , T, D, BrdColReplicate>, - linearColumnFilter_caller<9 , T, D, BrdColReplicate>, - linearColumnFilter_caller<10, T, D, BrdColReplicate>, - linearColumnFilter_caller<11, T, D, BrdColReplicate>, - linearColumnFilter_caller<12, T, D, BrdColReplicate>, - linearColumnFilter_caller<13, T, D, BrdColReplicate>, - linearColumnFilter_caller<14, T, D, BrdColReplicate>, - linearColumnFilter_caller<15, T, D, BrdColReplicate>, - linearColumnFilter_caller<16, T, D, BrdColReplicate> - }, + template + void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) { - 0, - linearColumnFilter_caller<1 , T, D, BrdColConstant>, - linearColumnFilter_caller<2 , T, D, BrdColConstant>, - 
linearColumnFilter_caller<3 , T, D, BrdColConstant>, - linearColumnFilter_caller<4 , T, D, BrdColConstant>, - linearColumnFilter_caller<5 , T, D, BrdColConstant>, - linearColumnFilter_caller<6 , T, D, BrdColConstant>, - linearColumnFilter_caller<7 , T, D, BrdColConstant>, - linearColumnFilter_caller<8 , T, D, BrdColConstant>, - linearColumnFilter_caller<9 , T, D, BrdColConstant>, - linearColumnFilter_caller<10, T, D, BrdColConstant>, - linearColumnFilter_caller<11, T, D, BrdColConstant>, - linearColumnFilter_caller<12, T, D, BrdColConstant>, - linearColumnFilter_caller<13, T, D, BrdColConstant>, - linearColumnFilter_caller<14, T, D, BrdColConstant>, - linearColumnFilter_caller<15, T, D, BrdColConstant>, - linearColumnFilter_caller<16, T, D, BrdColConstant> - }, - { - 0, - linearColumnFilter_caller<1 , T, D, BrdColReflect>, - linearColumnFilter_caller<2 , T, D, BrdColReflect>, - linearColumnFilter_caller<3 , T, D, BrdColReflect>, - linearColumnFilter_caller<4 , T, D, BrdColReflect>, - linearColumnFilter_caller<5 , T, D, BrdColReflect>, - linearColumnFilter_caller<6 , T, D, BrdColReflect>, - linearColumnFilter_caller<7 , T, D, BrdColReflect>, - linearColumnFilter_caller<8 , T, D, BrdColReflect>, - linearColumnFilter_caller<9 , T, D, BrdColReflect>, - linearColumnFilter_caller<10, T, D, BrdColReflect>, - linearColumnFilter_caller<11, T, D, BrdColReflect>, - linearColumnFilter_caller<12, T, D, BrdColReflect>, - linearColumnFilter_caller<13, T, D, BrdColReflect>, - linearColumnFilter_caller<14, T, D, BrdColReflect>, - linearColumnFilter_caller<15, T, D, BrdColReflect>, - linearColumnFilter_caller<16, T, D, BrdColReflect> - }, - { - 0, - linearColumnFilter_caller<1 , T, D, BrdColWrap>, - linearColumnFilter_caller<2 , T, D, BrdColWrap>, - linearColumnFilter_caller<3 , T, D, BrdColWrap>, - linearColumnFilter_caller<4 , T, D, BrdColWrap>, - linearColumnFilter_caller<5 , T, D, BrdColWrap>, - linearColumnFilter_caller<6 , T, D, BrdColWrap>, - linearColumnFilter_caller<7 , T, D, BrdColWrap>, - linearColumnFilter_caller<8 , T, D, BrdColWrap>, - linearColumnFilter_caller<9 , T, D, BrdColWrap>, - linearColumnFilter_caller<10, T, D, BrdColWrap>, - linearColumnFilter_caller<11, T, D, BrdColWrap>, - linearColumnFilter_caller<12, T, D, BrdColWrap>, - linearColumnFilter_caller<13, T, D, BrdColWrap>, - linearColumnFilter_caller<14, T, D, BrdColWrap>, - linearColumnFilter_caller<15, T, D, BrdColWrap>, - linearColumnFilter_caller<16, T, D, BrdColWrap>, + typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream); + static const caller_t callers[5][17] = + { + { + 0, + linearColumnFilter_caller<1 , T, D, BrdColReflect101>, + linearColumnFilter_caller<2 , T, D, BrdColReflect101>, + linearColumnFilter_caller<3 , T, D, BrdColReflect101>, + linearColumnFilter_caller<4 , T, D, BrdColReflect101>, + linearColumnFilter_caller<5 , T, D, BrdColReflect101>, + linearColumnFilter_caller<6 , T, D, BrdColReflect101>, + linearColumnFilter_caller<7 , T, D, BrdColReflect101>, + linearColumnFilter_caller<8 , T, D, BrdColReflect101>, + linearColumnFilter_caller<9 , T, D, BrdColReflect101>, + linearColumnFilter_caller<10, T, D, BrdColReflect101>, + linearColumnFilter_caller<11, T, D, BrdColReflect101>, + linearColumnFilter_caller<12, T, D, BrdColReflect101>, + linearColumnFilter_caller<13, T, D, BrdColReflect101>, + linearColumnFilter_caller<14, T, D, BrdColReflect101>, + linearColumnFilter_caller<15, T, D, BrdColReflect101>, + linearColumnFilter_caller<16, T, D, BrdColReflect101> + }, + { + 
+                    0,
+                    linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<10, T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<11, T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<12, T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<13, T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<14, T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<15, T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<16, T, D, BrdColReplicate>
+                },
+                {
+                    0,
+                    linearColumnFilter_caller<1 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<2 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<3 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<4 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<5 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<6 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<7 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<8 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<9 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<10, T, D, BrdColConstant>,
+                    linearColumnFilter_caller<11, T, D, BrdColConstant>,
+                    linearColumnFilter_caller<12, T, D, BrdColConstant>,
+                    linearColumnFilter_caller<13, T, D, BrdColConstant>,
+                    linearColumnFilter_caller<14, T, D, BrdColConstant>,
+                    linearColumnFilter_caller<15, T, D, BrdColConstant>,
+                    linearColumnFilter_caller<16, T, D, BrdColConstant>
+                },
+                {
+                    0,
+                    linearColumnFilter_caller<1 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<2 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<3 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<4 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<5 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<6 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<7 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<8 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<9 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<10, T, D, BrdColReflect>,
+                    linearColumnFilter_caller<11, T, D, BrdColReflect>,
+                    linearColumnFilter_caller<12, T, D, BrdColReflect>,
+                    linearColumnFilter_caller<13, T, D, BrdColReflect>,
+                    linearColumnFilter_caller<14, T, D, BrdColReflect>,
+                    linearColumnFilter_caller<15, T, D, BrdColReflect>,
+                    linearColumnFilter_caller<16, T, D, BrdColReflect>
+                },
+                {
+                    0,
+                    linearColumnFilter_caller<1 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<2 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<3 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<4 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<5 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<6 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<7 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<8 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<9 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<10, T, D, BrdColWrap>,
+                    linearColumnFilter_caller<11, T, D, BrdColWrap>,
+                    linearColumnFilter_caller<12, T, D, BrdColWrap>,
+                    linearColumnFilter_caller<13, T, D, BrdColWrap>,
+                    linearColumnFilter_caller<14, T, D, BrdColWrap>,
+                    linearColumnFilter_caller<15, T, D, BrdColWrap>,
+                    linearColumnFilter_caller<16, T, D, BrdColWrap>,
+                }
+            };
+
+            loadKernel(kernel, ksize);
+
+            callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
+        }
-    };
-
-    loadKernel(kernel, ksize);
-
-    callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
-}
-
-template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-//template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-//template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearColumnFilter_gpu<float , int   >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-
-} // namespace column_filter
-END_OPENCV_DEVICE_NAMESPACE
+        template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        //template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        //template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float , int   >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    } // namespace column_filter
+}}} // namespace cv { namespace gpu { namespace device
diff --git a/modules/gpu/src/cuda/copy_make_border.cu b/modules/gpu/src/cuda/copy_make_border.cu
index aafcdf5..3397672 100644
--- a/modules/gpu/src/cuda/copy_make_border.cu
+++ b/modules/gpu/src/cuda/copy_make_border.cu
@@ -43,87 +43,85 @@
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"
 
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace imgproc {
-
-template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)
-{
-    const int x = blockDim.x * blockIdx.x + threadIdx.x;
-    const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-    if (x < dst.cols && y < dst.rows)
-        dst.ptr(y)[x] = src(y - top, x - left);
-}
-
-template