From 0465b89e7ecbb9339babe79a8050f555479def29 Mon Sep 17 00:00:00 2001 From: Alexey Spizhevoy Date: Mon, 20 Dec 2010 09:07:19 +0000 Subject: [PATCH] gpu module refactoring: moved per-element operations into separated file --- modules/gpu/include/opencv2/gpu/gpu.hpp | 148 +++---- modules/gpu/src/arithm.cpp | 530 ------------------------- modules/gpu/src/cuda/element_operations.cu | 348 +++++++++++++++++ modules/gpu/src/cuda/mathfunc.cu | 330 ++-------------- modules/gpu/src/element_operations.cpp | 605 ++++++++++++++++++++++++++++- tests/gpu/src/bitwise_oper.cpp | 2 +- 6 files changed, 1048 insertions(+), 915 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index a8329da..4827f85 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -362,46 +362,10 @@ namespace cv ////////////////////////////// Arithmetics /////////////////////////////////// - //! adds one matrix to another (c = a + b) - //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c); - //! adds scalar to a matrix (c = a + s) - //! supports CV_32FC1 and CV_32FC2 type - CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c); - //! subtracts one matrix from another (c = a - b) - //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c); - //! subtracts scalar from a matrix (c = a - s) - //! supports CV_32FC1 and CV_32FC2 type - CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c); - //! computes element-wise product of the two arrays (c = a * b) - //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c); - //! multiplies matrix to a scalar (c = a * s) - //! supports CV_32FC1 and CV_32FC2 type - CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c); - //! computes element-wise quotient of the two arrays (c = a / b) - //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c); - //! computes element-wise quotient of matrix and scalar (c = a / s) - //! supports CV_32FC1 and CV_32FC2 type - CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c); - //! transposes the matrix //! supports CV_8UC1, CV_8SC1, CV_8UC4, CV_8SC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32FC1 type CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst); - //! computes element-wise absolute difference of two arrays (c = abs(a - b)) - //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c); - //! computes element-wise absolute difference of array and scalar (c = abs(a - s)) - //! supports only CV_32FC1 type - CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c); - - //! compares elements of two arrays (c = a b) - //! supports CV_8UC4, CV_32FC1 types - CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop); - //! computes mean value and standard deviation of all or selected array elements //! supports only CV_8UC1 type CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev); @@ -485,14 +449,6 @@ namespace cv //! copies each plane of a multi-channel array to a dedicated array (async version) CV_EXPORTS void split(const GpuMat& src, vector& dst, const Stream& stream); - //! 
computes exponent of each matrix element (b = e**a) - //! supports only CV_32FC1 type - CV_EXPORTS void exp(const GpuMat& a, GpuMat& b); - - //! computes natural logarithm of absolute value of each matrix element: b = log(abs(a)) - //! supports only CV_32FC1 type - CV_EXPORTS void log(const GpuMat& a, GpuMat& b); - //! computes magnitude of complex (x(i).re, x(i).im) vector //! supports only CV_32FC2 type CV_EXPORTS void magnitude(const GpuMat& x, GpuMat& magnitude); @@ -531,33 +487,6 @@ namespace cv //! async version CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, const Stream& stream); - - //! perfroms per-elements bit-wise inversion - CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat()); - //! async version - CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, const Stream& stream); - - //! calculates per-element bit-wise disjunction of two arrays - CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat()); - //! async version - CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream); - - //! calculates per-element bit-wise conjunction of two arrays - CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat()); - //! async version - CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream); - - //! calculates per-element bit-wise "exclusive or" operation - CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat()); - //! async version - CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream); - - //! Logical operators - CV_EXPORTS GpuMat operator ~ (const GpuMat& src); - CV_EXPORTS GpuMat operator | (const GpuMat& src1, const GpuMat& src2); - CV_EXPORTS GpuMat operator & (const GpuMat& src1, const GpuMat& src2); - CV_EXPORTS GpuMat operator ^ (const GpuMat& src1, const GpuMat& src2); - //! computes per-element minimum of two arrays (dst = min(src1, src2)) CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst); //! Async version @@ -578,6 +507,83 @@ namespace cv //! Async version CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream); + //////////////////////////// Per-element operations //////////////////////////////////// + + //! adds one matrix to another (c = a + b) + //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types + CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c); + //! adds scalar to a matrix (c = a + s) + //! supports CV_32FC1 and CV_32FC2 type + CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c); + + //! subtracts one matrix from another (c = a - b) + //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types + CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c); + //! subtracts scalar from a matrix (c = a - s) + //! supports CV_32FC1 and CV_32FC2 type + CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c); + + //! computes element-wise product of the two arrays (c = a * b) + //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types + CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c); + //! multiplies matrix to a scalar (c = a * s) + //! 
supports CV_32FC1 and CV_32FC2 types + CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c); + + //! computes element-wise quotient of the two arrays (c = a / b) + //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types + CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c); + //! computes element-wise quotient of matrix and scalar (c = a / s) + //! supports CV_32FC1 and CV_32FC2 types + CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c); + + //! computes exponent of each matrix element (b = e**a) + //! supports only CV_32FC1 type + CV_EXPORTS void exp(const GpuMat& a, GpuMat& b); + + //! computes natural logarithm of absolute value of each matrix element: b = log(abs(a)) + //! supports only CV_32FC1 type + CV_EXPORTS void log(const GpuMat& a, GpuMat& b); + + //! computes element-wise absolute difference of two arrays (c = abs(a - b)) + //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types + CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c); + //! computes element-wise absolute difference of array and scalar (c = abs(a - s)) + //! supports only CV_32FC1 type + CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c); + + //! compares elements of two arrays (c = a <cmpop> b) + //! supports CV_8UC4, CV_32FC1 types + CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop); + + //! performs per-element bit-wise inversion + CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat()); + //! version without mask + CV_EXPORTS GpuMat operator ~ (const GpuMat& src); + //! async version + CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, const Stream& stream); + + //! calculates per-element bit-wise disjunction of two arrays + CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat()); + //! version without mask + CV_EXPORTS GpuMat operator | (const GpuMat& src1, const GpuMat& src2); + //! async version + CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream); + + //! calculates per-element bit-wise conjunction of two arrays + CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat()); + //! version without mask + CV_EXPORTS GpuMat operator & (const GpuMat& src1, const GpuMat& src2); + //! async version + CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream); + + //! calculates per-element bit-wise "exclusive or" operation + CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat()); + //! version without mask + CV_EXPORTS GpuMat operator ^ (const GpuMat& src1, const GpuMat& src2); + //! 
async version + CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream); + ////////////////////////////// Image processing ////////////////////////////// diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 8a7abb8..56ba525 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -48,18 +48,7 @@ using namespace std; #if !defined (HAVE_CUDA) -void cv::gpu::add(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::add(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } -void cv::gpu::subtract(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::subtract(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } -void cv::gpu::multiply(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::multiply(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } -void cv::gpu::divide(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::divide(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } void cv::gpu::transpose(const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::absdiff(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::absdiff(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } -void cv::gpu::compare(const GpuMat&, const GpuMat&, GpuMat&, int) { throw_nogpu(); } void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); } double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; } double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; } @@ -89,18 +78,6 @@ void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, const Stream&) { throw_nogpu(); } void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); } void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, const Stream&) { throw_nogpu(); } -void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } -void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } -void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } -void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } -void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); } -cv::gpu::GpuMat cv::gpu::operator ~ (const GpuMat&) { throw_nogpu(); return GpuMat(); } -cv::gpu::GpuMat cv::gpu::operator | (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); } -cv::gpu::GpuMat cv::gpu::operator & (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); } -cv::gpu::GpuMat cv::gpu::operator ^ (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); } void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); } void cv::gpu::min(const GpuMat&, double, GpuMat&) { throw_nogpu(); } @@ -113,164 +90,6 
@@ void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu() #else /* !defined (HAVE_CUDA) */ //////////////////////////////////////////////////////////////////////// -// add subtract multiply divide - -namespace -{ - typedef NppStatus (*npp_arithm_8u_t)(const Npp8u* pSrc1, int nSrc1Step, const Npp8u* pSrc2, int nSrc2Step, Npp8u* pDst, int nDstStep, - NppiSize oSizeROI, int nScaleFactor); - typedef NppStatus (*npp_arithm_32s_t)(const Npp32s* pSrc1, int nSrc1Step, const Npp32s* pSrc2, int nSrc2Step, Npp32s* pDst, - int nDstStep, NppiSize oSizeROI); - typedef NppStatus (*npp_arithm_32f_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst, - int nDstStep, NppiSize oSizeROI); - - void nppArithmCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, - npp_arithm_8u_t npp_func_8uc1, npp_arithm_8u_t npp_func_8uc4, - npp_arithm_32s_t npp_func_32sc1, npp_arithm_32f_t npp_func_32fc1) - { - CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type()); - - CV_Assert(src1.type() == CV_8UC1 || src1.type() == CV_8UC4 || src1.type() == CV_32SC1 || src1.type() == CV_32FC1); - - dst.create( src1.size(), src1.type() ); - - NppiSize sz; - sz.width = src1.cols; - sz.height = src1.rows; - - switch (src1.type()) - { - case CV_8UC1: - nppSafeCall( npp_func_8uc1(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz, 0) ); - break; - case CV_8UC4: - nppSafeCall( npp_func_8uc4(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz, 0) ); - break; - case CV_32SC1: - nppSafeCall( npp_func_32sc1(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz) ); - break; - case CV_32FC1: - nppSafeCall( npp_func_32fc1(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz) ); - break; - default: - CV_Assert(!"Unsupported source type"); - } - } - - template struct NppArithmScalarFunc; - template<> struct NppArithmScalarFunc<1> - { - typedef NppStatus (*func_ptr)(const Npp32f *pSrc, int nSrcStep, Npp32f nValue, Npp32f *pDst, - int nDstStep, NppiSize oSizeROI); - }; - template<> struct NppArithmScalarFunc<2> - { - typedef NppStatus (*func_ptr)(const Npp32fc *pSrc, int nSrcStep, Npp32fc nValue, Npp32fc *pDst, - int nDstStep, NppiSize oSizeROI); - }; - - template::func_ptr func> struct NppArithmScalar; - template::func_ptr func> struct NppArithmScalar<1, func> - { - static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst) - { - dst.create(src.size(), src.type()); - - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), src.step, (Npp32f)sc[0], dst.ptr(), dst.step, sz) ); - } - }; - template::func_ptr func> struct NppArithmScalar<2, func> - { - static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst) - { - dst.create(src.size(), src.type()); - - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Npp32fc nValue; - nValue.re = (Npp32f)sc[0]; - nValue.im = (Npp32f)sc[1]; - - nppSafeCall( func(src.ptr(), src.step, nValue, dst.ptr(), dst.step, sz) ); - } - }; -} - -void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) -{ - nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R); -} - -void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) -{ - nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R); -} - -void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) -{ - 
nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R); -} - -void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) -{ - nppArithmCaller(src2, src1, dst, nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs, nppiDiv_32s_C1R, nppiDiv_32f_C1R); -} - -void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst) -{ - typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst); - static const caller_t callers[] = {0, NppArithmScalar<1, nppiAddC_32f_C1R>::calc, NppArithmScalar<2, nppiAddC_32fc_C1R>::calc}; - - CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2); - - callers[src.channels()](src, sc, dst); -} - -void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst) -{ - typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst); - static const caller_t callers[] = {0, NppArithmScalar<1, nppiSubC_32f_C1R>::calc, NppArithmScalar<2, nppiSubC_32fc_C1R>::calc}; - - CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2); - - callers[src.channels()](src, sc, dst); -} - -void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst) -{ - typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst); - static const caller_t callers[] = {0, NppArithmScalar<1, nppiMulC_32f_C1R>::calc, NppArithmScalar<2, nppiMulC_32fc_C1R>::calc}; - - CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2); - - callers[src.channels()](src, sc, dst); -} - -void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst) -{ - typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst); - static const caller_t callers[] = {0, NppArithmScalar<1, nppiDivC_32f_C1R>::calc, NppArithmScalar<2, nppiDivC_32fc_C1R>::calc}; - - CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2); - - callers[src.channels()](src, sc, dst); -} - -//////////////////////////////////////////////////////////////////////// // transpose namespace cv { namespace gpu { namespace mathfunc @@ -300,112 +119,6 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst) } //////////////////////////////////////////////////////////////////////// -// absdiff - -void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) -{ - CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type()); - - CV_Assert(src1.type() == CV_8UC1 || src1.type() == CV_8UC4 || src1.type() == CV_32SC1 || src1.type() == CV_32FC1); - - dst.create( src1.size(), src1.type() ); - - NppiSize sz; - sz.width = src1.cols; - sz.height = src1.rows; - - switch (src1.type()) - { - case CV_8UC1: - nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz) ); - break; - case CV_8UC4: - nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz) ); - break; - case CV_32SC1: - nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz) ); - break; - case CV_32FC1: - nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz) ); - break; - default: - CV_Assert(!"Unsupported source type"); - } -} - -void cv::gpu::absdiff(const GpuMat& src, const Scalar& s, GpuMat& dst) -{ - CV_Assert(src.type() == CV_32FC1); - - dst.create( src.size(), src.type() ); - - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( nppiAbsDiffC_32f_C1R(src.ptr(), src.step, dst.ptr(), dst.step, sz, (Npp32f)s[0]) ); -} - 
-//////////////////////////////////////////////////////////////////////// -// compare - -namespace cv { namespace gpu { namespace mathfunc -{ - void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst); - void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst); -}}} - -void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop) -{ - CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type()); - - CV_Assert(src1.type() == CV_8UC4 || src1.type() == CV_32FC1); - - dst.create( src1.size(), CV_8UC1 ); - - static const NppCmpOp nppCmpOp[] = { NPP_CMP_EQ, NPP_CMP_GREATER, NPP_CMP_GREATER_EQ, NPP_CMP_LESS, NPP_CMP_LESS_EQ }; - - NppiSize sz; - sz.width = src1.cols; - sz.height = src1.rows; - - if (src1.type() == CV_8UC4) - { - if (cmpop != CMP_NE) - { - nppSafeCall( nppiCompare_8u_C4R(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz, nppCmpOp[cmpop]) ); - } - else - { - mathfunc::compare_ne_8uc4(src1, src2, dst); - } - } - else - { - if (cmpop != CMP_NE) - { - nppSafeCall( nppiCompare_32f_C1R(src1.ptr(), src1.step, - src2.ptr(), src2.step, - dst.ptr(), dst.step, sz, nppCmpOp[cmpop]) ); - } - else - { - mathfunc::compare_ne_32f(src1, src2, dst); - } - } -} - -//////////////////////////////////////////////////////////////////////// // meanStdDev void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev) @@ -997,249 +710,6 @@ void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& ::polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream)); } -////////////////////////////////////////////////////////////////////////////// -// Per-element bit-wise logical matrix operations - -namespace cv { namespace gpu { namespace mathfunc -{ - void bitwise_not_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src, PtrStep dst, cudaStream_t stream); - - template - void bitwise_mask_not_caller(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream); - - void bitwise_or_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream); - - template - void bitwise_mask_or_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream); - - void bitwise_and_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream); - - template - void bitwise_mask_and_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream); - - void bitwise_xor_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream); - - template - void bitwise_mask_xor_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream); -}}} - -namespace -{ - void bitwise_not_caller(const GpuMat& src, GpuMat& dst, cudaStream_t stream) - { - dst.create(src.size(), src.type()); - - cv::gpu::mathfunc::bitwise_not_caller(src.rows, src.cols, src.elemSize1(), - dst.channels(), src, dst, stream); - } - - - void bitwise_not_caller(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) - { - using namespace cv::gpu; - - typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - static Caller 
callers[] = {mathfunc::bitwise_mask_not_caller, mathfunc::bitwise_mask_not_caller, - mathfunc::bitwise_mask_not_caller, mathfunc::bitwise_mask_not_caller, - mathfunc::bitwise_mask_not_caller, mathfunc::bitwise_mask_not_caller, - mathfunc::bitwise_mask_not_caller}; - - CV_Assert(mask.type() == CV_8U && mask.size() == src.size()); - dst.create(src.size(), src.type()); - - Caller caller = callers[src.depth()]; - CV_Assert(caller); - - int cn = src.depth() != CV_64F ? src.channels() : src.channels() * (sizeof(double) / sizeof(unsigned int)); - caller(src.rows, src.cols, cn, src, mask, dst, stream); - } - - - void bitwise_or_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream) - { - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - dst.create(src1.size(), src1.type()); - - cv::gpu::mathfunc::bitwise_or_caller(dst.rows, dst.cols, dst.elemSize1(), - dst.channels(), src1, src2, dst, stream); - } - - - void bitwise_or_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) - { - using namespace cv::gpu; - - typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - static Caller callers[] = {mathfunc::bitwise_mask_or_caller, mathfunc::bitwise_mask_or_caller, - mathfunc::bitwise_mask_or_caller, mathfunc::bitwise_mask_or_caller, - mathfunc::bitwise_mask_or_caller, mathfunc::bitwise_mask_or_caller, - mathfunc::bitwise_mask_or_caller}; - - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - dst.create(src1.size(), src1.type()); - - Caller caller = callers[src1.depth()]; - CV_Assert(caller); - - int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int)); - caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream); - } - - - void bitwise_and_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream) - { - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - dst.create(src1.size(), src1.type()); - - cv::gpu::mathfunc::bitwise_and_caller(dst.rows, dst.cols, dst.elemSize1(), - dst.channels(), src1, src2, dst, stream); - } - - - void bitwise_and_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) - { - using namespace cv::gpu; - - typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - static Caller callers[] = {mathfunc::bitwise_mask_and_caller, mathfunc::bitwise_mask_and_caller, - mathfunc::bitwise_mask_and_caller, mathfunc::bitwise_mask_and_caller, - mathfunc::bitwise_mask_and_caller, mathfunc::bitwise_mask_and_caller, - mathfunc::bitwise_mask_and_caller}; - - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - dst.create(src1.size(), src1.type()); - - Caller caller = callers[src1.depth()]; - CV_Assert(caller); - - int cn = dst.depth() != CV_64F ? 
dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int)); - caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream); - } - - - void bitwise_xor_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream) - { - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - dst.create(src1.size(), src1.type()); - - cv::gpu::mathfunc::bitwise_xor_caller(dst.rows, dst.cols, dst.elemSize1(), - dst.channels(), src1, src2, dst, stream); - } - - - void bitwise_xor_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) - { - using namespace cv::gpu; - - typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - static Caller callers[] = {mathfunc::bitwise_mask_xor_caller, mathfunc::bitwise_mask_xor_caller, - mathfunc::bitwise_mask_xor_caller, mathfunc::bitwise_mask_xor_caller, - mathfunc::bitwise_mask_xor_caller, mathfunc::bitwise_mask_xor_caller, - mathfunc::bitwise_mask_xor_caller}; - - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - dst.create(src1.size(), src1.type()); - - Caller caller = callers[src1.depth()]; - CV_Assert(caller); - - int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int)); - caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream); - } -} - -void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask) -{ - if (mask.empty()) - ::bitwise_not_caller(src, dst, 0); - else - ::bitwise_not_caller(src, dst, mask, 0); -} - -void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, const Stream& stream) -{ - if (mask.empty()) - ::bitwise_not_caller(src, dst, StreamAccessor::getStream(stream)); - else - ::bitwise_not_caller(src, dst, mask, StreamAccessor::getStream(stream)); -} - -void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask) -{ - if (mask.empty()) - ::bitwise_or_caller(src1, src2, dst, 0); - else - ::bitwise_or_caller(src1, src2, dst, mask, 0); -} - -void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream) -{ - if (mask.empty()) - ::bitwise_or_caller(src1, src2, dst, StreamAccessor::getStream(stream)); - else - ::bitwise_or_caller(src1, src2, dst, mask, StreamAccessor::getStream(stream)); -} - -void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask) -{ - if (mask.empty()) - ::bitwise_and_caller(src1, src2, dst, 0); - else - ::bitwise_and_caller(src1, src2, dst, mask, 0); -} - -void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream) -{ - if (mask.empty()) - ::bitwise_and_caller(src1, src2, dst, StreamAccessor::getStream(stream)); - else - ::bitwise_and_caller(src1, src2, dst, mask, StreamAccessor::getStream(stream)); -} - -void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask) -{ - if (mask.empty()) - ::bitwise_xor_caller(src1, src2, dst, 0); - else - ::bitwise_xor_caller(src1, src2, dst, mask, 0); -} - -void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream) -{ - if (mask.empty()) - ::bitwise_xor_caller(src1, src2, dst, StreamAccessor::getStream(stream)); - else - ::bitwise_xor_caller(src1, src2, dst, mask, StreamAccessor::getStream(stream)); - -} - -cv::gpu::GpuMat cv::gpu::operator ~ (const GpuMat& 
src) -{ - GpuMat dst; - bitwise_not(src, dst); - return dst; -} - -cv::gpu::GpuMat cv::gpu::operator | (const GpuMat& src1, const GpuMat& src2) -{ - GpuMat dst; - bitwise_or(src1, src2, dst); - return dst; -} - -cv::gpu::GpuMat cv::gpu::operator & (const GpuMat& src1, const GpuMat& src2) -{ - GpuMat dst; - bitwise_and(src1, src2, dst); - return dst; -} - -cv::gpu::GpuMat cv::gpu::operator ^ (const GpuMat& src1, const GpuMat& src2) -{ - GpuMat dst; - bitwise_xor(src1, src2, dst); - return dst; -} ////////////////////////////////////////////////////////////////////////////// // min/max diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index e69de29..ba9d011 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -0,0 +1,348 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "opencv2/gpu/device/vecmath.hpp" +#include "transform.hpp" +#include "internal_shared.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace cv { namespace gpu { namespace mathfunc +{ + + ////////////////////////////////////////////////////////////////////////////////////// + // Compare + + template + struct NotEqual + { + __device__ uchar operator()(const T1& src1, const T2& src2) + { + return static_cast(static_cast(src1 != src2) * 255); + } + }; + + template + inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst) + { + NotEqual op; + transform(static_cast< DevMem2D_ >(src1), static_cast< DevMem2D_ >(src2), dst, op, 0); + } + + void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst) + { + compare_ne(src1, src2, dst); + } + void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst) + { + compare_ne(src1, src2, dst); + } + + + ////////////////////////////////////////////////////////////////////////// + // Unary bitwise logical matrix operations + + enum { UN_OP_NOT }; + + template + struct UnOp; + + template + struct UnOp + { + static __device__ T call(T v) { return ~v; } + }; + + + template + __global__ void bitwiseUnOpKernel(int rows, int width, const PtrStep src, PtrStep dst) + { + const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (y < rows) + { + uchar* dst_ptr = dst.ptr(y) + x; + const uchar* src_ptr = src.ptr(y) + x; + if (x + sizeof(uint) - 1 < width) + { + *(uint*)dst_ptr = UnOp::call(*(uint*)src_ptr); + } + else + { + const uchar* src_end = src.ptr(y) + width; + while (src_ptr < src_end) + { + *dst_ptr++ = UnOp::call(*src_ptr++); + } + } + } + } + + + template + void bitwiseUnOp(int rows, int width, const PtrStep src, PtrStep dst, + cudaStream_t stream) + { + dim3 threads(16, 16); + dim3 grid(divUp(width, threads.x * sizeof(uint)), + divUp(rows, threads.y)); + + bitwiseUnOpKernel<<>>(rows, width, src, dst); + + if (stream == 0) + cudaSafeCall(cudaThreadSynchronize()); + } + + + template + __global__ void bitwiseUnOpKernel(int rows, int cols, int cn, const PtrStep src, + const PtrStep mask, PtrStep dst) + { + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x < cols && y < rows && mask.ptr(y)[x / cn]) + { + T* dst_row = (T*)dst.ptr(y); + const T* src_row = (const T*)src.ptr(y); + + dst_row[x] = UnOp::call(src_row[x]); + } + } + + + template + void bitwiseUnOp(int rows, int cols, int cn, const PtrStep src, + const PtrStep mask, PtrStep dst, cudaStream_t stream) + { + dim3 threads(16, 16); + dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); + + bitwiseUnOpKernel<<>>(rows, cols, cn, src, mask, dst); + + if (stream == 0) + cudaSafeCall(cudaThreadSynchronize()); + } + + + void bitwiseNotCaller(int rows, int cols, int elem_size1, int cn, + const PtrStep src, PtrStep dst, cudaStream_t stream) + { + bitwiseUnOp(rows, cols * elem_size1 * cn, src, dst, stream); + } + + + template + void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStep src, + const PtrStep mask, PtrStep dst, cudaStream_t stream) + { + bitwiseUnOp(rows, cols * cn, cn, src, mask, dst, stream); + } + + template void bitwiseMaskNotCaller(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t); + template void bitwiseMaskNotCaller(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t); + template void 
bitwiseMaskNotCaller(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t); + + + ////////////////////////////////////////////////////////////////////////// + // Binary bitwise logical matrix operations + + enum { BIN_OP_OR, BIN_OP_AND, BIN_OP_XOR }; + + template + struct BinOp; + + template + struct BinOp + { + static __device__ T call(T a, T b) { return a | b; } + }; + + + template + struct BinOp + { + static __device__ T call(T a, T b) { return a & b; } + }; + + template + struct BinOp + { + static __device__ T call(T a, T b) { return a ^ b; } + }; + + + template + __global__ void bitwiseBinOpKernel(int rows, int width, const PtrStep src1, + const PtrStep src2, PtrStep dst) + { + const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (y < rows) + { + uchar* dst_ptr = dst.ptr(y) + x; + const uchar* src1_ptr = src1.ptr(y) + x; + const uchar* src2_ptr = src2.ptr(y) + x; + + if (x + sizeof(uint) - 1 < width) + { + *(uint*)dst_ptr = BinOp::call(*(uint*)src1_ptr, *(uint*)src2_ptr); + } + else + { + const uchar* src1_end = src1.ptr(y) + width; + while (src1_ptr < src1_end) + { + *dst_ptr++ = BinOp::call(*src1_ptr++, *src2_ptr++); + } + } + } + } + + + template + void bitwiseBinOp(int rows, int width, const PtrStep src1, const PtrStep src2, + PtrStep dst, cudaStream_t stream) + { + dim3 threads(16, 16); + dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y)); + + bitwiseBinOpKernel<<>>(rows, width, src1, src2, dst); + + if (stream == 0) + cudaSafeCall(cudaThreadSynchronize()); + } + + + template + __global__ void bitwiseBinOpKernel( + int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, + const PtrStep mask, PtrStep dst) + { + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x < cols && y < rows && mask.ptr(y)[x / cn]) + { + T* dst_row = (T*)dst.ptr(y); + const T* src1_row = (const T*)src1.ptr(y); + const T* src2_row = (const T*)src2.ptr(y); + + dst_row[x] = BinOp::call(src1_row[x], src2_row[x]); + } + } + + + template + void bitwiseBinOp(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, + const PtrStep mask, PtrStep dst, cudaStream_t stream) + { + dim3 threads(16, 16); + dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); + + bitwiseBinOpKernel<<>>(rows, cols, cn, src1, src2, mask, dst); + + if (stream == 0) + cudaSafeCall(cudaThreadSynchronize()); + } + + + void bitwiseOrCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, + const PtrStep src2, PtrStep dst, cudaStream_t stream) + { + bitwiseBinOp(rows, cols * elem_size1 * cn, src1, src2, dst, stream); + } + + + template + void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, + const PtrStep mask, PtrStep dst, cudaStream_t stream) + { + bitwiseBinOp(rows, cols * cn, cn, src1, src2, mask, dst, stream); + } + + template void bitwiseMaskOrCaller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); + template void bitwiseMaskOrCaller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); + template void bitwiseMaskOrCaller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); + + + void bitwiseAndCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, + const PtrStep src2, PtrStep dst, cudaStream_t stream) + { + bitwiseBinOp(rows, cols * elem_size1 * cn, src1, src2, dst, stream); + } + 
+ + template + void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, + const PtrStep mask, PtrStep dst, cudaStream_t stream) + { + bitwiseBinOp(rows, cols * cn, cn, src1, src2, mask, dst, stream); + } + + template void bitwiseMaskAndCaller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); + template void bitwiseMaskAndCaller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); + template void bitwiseMaskAndCaller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); + + + void bitwiseXorCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, + const PtrStep src2, PtrStep dst, cudaStream_t stream) + { + bitwiseBinOp(rows, cols * elem_size1 * cn, src1, src2, dst, stream); + } + + + template + void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, + const PtrStep mask, PtrStep dst, cudaStream_t stream) + { + bitwiseBinOp(rows, cols * cn, cn, src1, src2, mask, dst, stream); + } + + template void bitwiseMaskXorCaller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); + template void bitwiseMaskXorCaller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); + template void bitwiseMaskXorCaller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); + +}}} diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu index 387d49d..fe5a0e2 100644 --- a/modules/gpu/src/cuda/mathfunc.cu +++ b/modules/gpu/src/cuda/mathfunc.cu @@ -78,6 +78,29 @@ namespace cv { namespace gpu { namespace mathfunc } } + + struct Mask8U + { + explicit Mask8U(PtrStep mask): mask(mask) {} + + __device__ bool operator()(int y, int x) const + { + return mask.ptr(y)[x]; + } + + PtrStep mask; + }; + + + struct MaskTrue + { + __device__ bool operator()(int y, int x) const + { + return true; + } + }; + + struct Nothing { static __device__ void calc(int, int, float, float, float*, size_t, float) @@ -235,313 +258,6 @@ namespace cv { namespace gpu { namespace mathfunc callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream); } -////////////////////////////////////////////////////////////////////////////////////// -// Compare - - template - struct NotEqual - { - __device__ uchar operator()(const T1& src1, const T2& src2) - { - return static_cast(static_cast(src1 != src2) * 255); - } - }; - - template - inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst) - { - NotEqual op; - transform(static_cast< DevMem2D_ >(src1), static_cast< DevMem2D_ >(src2), dst, op, 0); - } - - void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst) - { - compare_ne(src1, src2, dst); - } - void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst) - { - compare_ne(src1, src2, dst); - } - - -////////////////////////////////////////////////////////////////////////////// -// Per-element bit-wise logical matrix operations - - struct Mask8U - { - explicit Mask8U(PtrStep mask): mask(mask) {} - - __device__ bool operator()(int y, int x) const - { - return mask.ptr(y)[x]; - } - - PtrStep mask; - }; - - - struct MaskTrue - { - __device__ bool operator()(int y, int x) const - { - return true; - } - }; - - //------------------------------------------------------------------------ - // Unary operations - - enum { UN_OP_NOT }; - - template - struct UnOp; - - template - struct UnOp 
- { - static __device__ T call(T v) { return ~v; } - }; - - - template - __global__ void bitwise_un_op_kernel(int rows, int width, const PtrStep src, PtrStep dst) - { - const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (y < rows) - { - uchar* dst_ptr = dst.ptr(y) + x; - const uchar* src_ptr = src.ptr(y) + x; - if (x + sizeof(uint) - 1 < width) - { - *(uint*)dst_ptr = UnOp::call(*(uint*)src_ptr); - } - else - { - const uchar* src_end = src.ptr(y) + width; - while (src_ptr < src_end) - { - *dst_ptr++ = UnOp::call(*src_ptr++); - } - } - } - } - - - template - void bitwise_un_op(int rows, int width, const PtrStep src, PtrStep dst, cudaStream_t stream) - { - dim3 threads(16, 16); - dim3 grid(divUp(width, threads.x * sizeof(uint)), - divUp(rows, threads.y)); - - bitwise_un_op_kernel<<>>(rows, width, src, dst); - - if (stream == 0) - cudaSafeCall(cudaThreadSynchronize()); - } - - - template - __global__ void bitwise_un_op_kernel(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst) - { - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (x < cols && y < rows && mask.ptr(y)[x / cn]) - { - T* dst_row = (T*)dst.ptr(y); - const T* src_row = (const T*)src.ptr(y); - - dst_row[x] = UnOp::call(src_row[x]); - } - } - - - template - void bitwise_un_op(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream) - { - dim3 threads(16, 16); - dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); - - bitwise_un_op_kernel<<>>(rows, cols, cn, src, mask, dst); - - if (stream == 0) - cudaSafeCall(cudaThreadSynchronize()); - } - - - void bitwise_not_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src, PtrStep dst, cudaStream_t stream) - { - bitwise_un_op(rows, cols * elem_size1 * cn, src, dst, stream); - } - - - template - void bitwise_mask_not_caller(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream) - { - bitwise_un_op(rows, cols * cn, cn, src, mask, dst, stream); - } - - template void bitwise_mask_not_caller(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - template void bitwise_mask_not_caller(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - template void bitwise_mask_not_caller(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - - //------------------------------------------------------------------------ - // Binary operations - - enum { BIN_OP_OR, BIN_OP_AND, BIN_OP_XOR }; - - template - struct BinOp; - - template - struct BinOp - { - static __device__ T call(T a, T b) { return a | b; } - }; - - - template - struct BinOp - { - static __device__ T call(T a, T b) { return a & b; } - }; - - template - struct BinOp - { - static __device__ T call(T a, T b) { return a ^ b; } - }; - - - template - __global__ void bitwise_bin_op_kernel(int rows, int width, const PtrStep src1, const PtrStep src2, PtrStep dst) - { - const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (y < rows) - { - uchar* dst_ptr = dst.ptr(y) + x; - const uchar* src1_ptr = src1.ptr(y) + x; - const uchar* src2_ptr = src2.ptr(y) + x; - - if (x + sizeof(uint) - 1 < width) - { - *(uint*)dst_ptr = BinOp::call(*(uint*)src1_ptr, *(uint*)src2_ptr); - } - else - { - const uchar* src1_end = src1.ptr(y) + width; - while (src1_ptr < src1_end) - { - *dst_ptr++ = 
BinOp::call(*src1_ptr++, *src2_ptr++); - } - } - } - } - - - template - void bitwise_bin_op(int rows, int width, const PtrStep src1, const PtrStep src2, PtrStep dst, - cudaStream_t stream) - { - dim3 threads(16, 16); - dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y)); - - bitwise_bin_op_kernel<<>>(rows, width, src1, src2, dst); - - if (stream == 0) - cudaSafeCall(cudaThreadSynchronize()); - } - - - template - __global__ void bitwise_bin_op_kernel( - int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, - const PtrStep mask, PtrStep dst) - { - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (x < cols && y < rows && mask.ptr(y)[x / cn]) - { - T* dst_row = (T*)dst.ptr(y); - const T* src1_row = (const T*)src1.ptr(y); - const T* src2_row = (const T*)src2.ptr(y); - - dst_row[x] = BinOp::call(src1_row[x], src2_row[x]); - } - } - - - template - void bitwise_bin_op(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, - const PtrStep mask, PtrStep dst, cudaStream_t stream) - { - dim3 threads(16, 16); - dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); - - bitwise_bin_op_kernel<<>>(rows, cols, cn, src1, src2, mask, dst); - - if (stream == 0) - cudaSafeCall(cudaThreadSynchronize()); - } - - - void bitwise_or_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream) - { - bitwise_bin_op(rows, cols * elem_size1 * cn, src1, src2, dst, stream); - } - - - template - void bitwise_mask_or_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream) - { - bitwise_bin_op(rows, cols * cn, cn, src1, src2, mask, dst, stream); - } - - template void bitwise_mask_or_caller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - template void bitwise_mask_or_caller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - template void bitwise_mask_or_caller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - - - void bitwise_and_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream) - { - bitwise_bin_op(rows, cols * elem_size1 * cn, src1, src2, dst, stream); - } - - - template - void bitwise_mask_and_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream) - { - bitwise_bin_op(rows, cols * cn, cn, src1, src2, mask, dst, stream); - } - - template void bitwise_mask_and_caller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - template void bitwise_mask_and_caller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - template void bitwise_mask_and_caller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - - - void bitwise_xor_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream) - { - bitwise_bin_op(rows, cols * elem_size1 * cn, src1, src2, dst, stream); - } - - - template - void bitwise_mask_xor_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream) - { - bitwise_bin_op(rows, cols * cn, cn, src1, src2, mask, dst, stream); - } - - template void bitwise_mask_xor_caller(int, int, int, const 
PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - template void bitwise_mask_xor_caller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - template void bitwise_mask_xor_caller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); - - ////////////////////////////////////////////////////////////////////////////// // Min max diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp index ea3f832..5d802bd 100644 --- a/modules/gpu/src/element_operations.cpp +++ b/modules/gpu/src/element_operations.cpp @@ -1,16 +1,609 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other GpuMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or bpied warranties, including, but not limited to, the bpied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ +#include "precomp.hpp" +using namespace cv; +using namespace cv::gpu; +#if !defined (HAVE_CUDA) +void cv::gpu::add(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } +void cv::gpu::add(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } +void cv::gpu::subtract(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } +void cv::gpu::subtract(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } +void cv::gpu::multiply(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } +void cv::gpu::multiply(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } +void cv::gpu::divide(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } +void cv::gpu::divide(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } +void cv::gpu::absdiff(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } +void cv::gpu::absdiff(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); } +void cv::gpu::compare(const GpuMat&, const GpuMat&, GpuMat&, int) { throw_nogpu(); } +void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } +void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); } +void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } +void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); } +void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } +void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); } +void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } +void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); } +cv::gpu::GpuMat cv::gpu::operator ~ (const GpuMat&) { throw_nogpu(); return GpuMat(); } +cv::gpu::GpuMat cv::gpu::operator | (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); } +cv::gpu::GpuMat cv::gpu::operator & (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); } +cv::gpu::GpuMat cv::gpu::operator ^ (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); } +#else -////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////// Unary per-element operations ///////////////////////////////////////// -// operation(GpuMat src, GpuMat dst) +//////////////////////////////////////////////////////////////////////// +// Basic arithmetical operations (add subtract multiply divide) +namespace +{ + typedef NppStatus (*npp_arithm_8u_t)(const Npp8u* pSrc1, int nSrc1Step, const Npp8u* pSrc2, int nSrc2Step, Npp8u* pDst, int nDstStep, + NppiSize oSizeROI, int nScaleFactor); + typedef NppStatus (*npp_arithm_32s_t)(const Npp32s* pSrc1, int nSrc1Step, const Npp32s* pSrc2, int nSrc2Step, Npp32s* pDst, + int nDstStep, NppiSize oSizeROI); + typedef NppStatus (*npp_arithm_32f_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst, + int nDstStep, NppiSize oSizeROI); + void nppArithmCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, + npp_arithm_8u_t npp_func_8uc1, npp_arithm_8u_t npp_func_8uc4, + npp_arithm_32s_t npp_func_32sc1, npp_arithm_32f_t npp_func_32fc1) + { + CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type()); + CV_Assert(src1.type() == CV_8UC1 || src1.type() == CV_8UC4 || src1.type() == CV_32SC1 || src1.type() == CV_32FC1); 
-//////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////// Binary per-element operations ////////////////////////////////////////
-// operation(GpuMat src1, GpuMat src2, GpuMat dst)
+        dst.create( src1.size(), src1.type() );
+
+        NppiSize sz;
+        sz.width  = src1.cols;
+        sz.height = src1.rows;
+
+        switch (src1.type())
+        {
+        case CV_8UC1:
+            nppSafeCall( npp_func_8uc1(src1.ptr<Npp8u>(), src1.step,
+                src2.ptr<Npp8u>(), src2.step,
+                dst.ptr<Npp8u>(), dst.step, sz, 0) );
+            break;
+        case CV_8UC4:
+            nppSafeCall( npp_func_8uc4(src1.ptr<Npp8u>(), src1.step,
+                src2.ptr<Npp8u>(), src2.step,
+                dst.ptr<Npp8u>(), dst.step, sz, 0) );
+            break;
+        case CV_32SC1:
+            nppSafeCall( npp_func_32sc1(src1.ptr<Npp32s>(), src1.step,
+                src2.ptr<Npp32s>(), src2.step,
+                dst.ptr<Npp32s>(), dst.step, sz) );
+            break;
+        case CV_32FC1:
+            nppSafeCall( npp_func_32fc1(src1.ptr<Npp32f>(), src1.step,
+                src2.ptr<Npp32f>(), src2.step,
+                dst.ptr<Npp32f>(), dst.step, sz) );
+            break;
+        default:
+            CV_Assert(!"Unsupported source type");
+        }
+    }
+
+    template<int SCN> struct NppArithmScalarFunc;
+    template<> struct NppArithmScalarFunc<1>
+    {
+        typedef NppStatus (*func_ptr)(const Npp32f *pSrc, int nSrcStep, Npp32f nValue, Npp32f *pDst,
+            int nDstStep, NppiSize oSizeROI);
+    };
+    template<> struct NppArithmScalarFunc<2>
+    {
+        typedef NppStatus (*func_ptr)(const Npp32fc *pSrc, int nSrcStep, Npp32fc nValue, Npp32fc *pDst,
+            int nDstStep, NppiSize oSizeROI);
+    };
+
+    template<int SCN, typename NppArithmScalarFunc<SCN>::func_ptr func> struct NppArithmScalar;
+    template<typename NppArithmScalarFunc<1>::func_ptr func> struct NppArithmScalar<1, func>
+    {
+        static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst)
+        {
+            dst.create(src.size(), src.type());
+
+            NppiSize sz;
+            sz.width  = src.cols;
+            sz.height = src.rows;
+
+            nppSafeCall( func(src.ptr<Npp32f>(), src.step, (Npp32f)sc[0], dst.ptr<Npp32f>(), dst.step, sz) );
+        }
+    };
+    template<typename NppArithmScalarFunc<2>::func_ptr func> struct NppArithmScalar<2, func>
+    {
+        static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst)
+        {
+            dst.create(src.size(), src.type());
+
+            NppiSize sz;
+            sz.width  = src.cols;
+            sz.height = src.rows;
+
+            Npp32fc nValue;
+            nValue.re = (Npp32f)sc[0];
+            nValue.im = (Npp32f)sc[1];
+
+            nppSafeCall( func(src.ptr<Npp32fc>(), src.step, nValue, dst.ptr<Npp32fc>(), dst.step, sz) );
+        }
+    };
+}
+
+void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
+{
+    nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R);
+}
+
+void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
+{
+    nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R);
+}
+
+void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
+{
+    nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R);
+}
+
+void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
+{
+    nppArithmCaller(src2, src1, dst, nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs, nppiDiv_32s_C1R, nppiDiv_32f_C1R);
+}
+
+void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst)
+{
+    typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst);
+    static const caller_t callers[] = {0, NppArithmScalar<1, nppiAddC_32f_C1R>::calc, NppArithmScalar<2, nppiAddC_32fc_C1R>::calc};
+
+    CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2);
+
+    callers[src.channels()](src, sc, dst);
+}
+
+void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst)
+{
+    typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst);
+    static const caller_t callers[] = {0, NppArithmScalar<1, nppiSubC_32f_C1R>::calc, NppArithmScalar<2, nppiSubC_32fc_C1R>::calc};
+
+    CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2);
+
+    callers[src.channels()](src, sc, dst);
+}
+
+void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst)
+{
+    typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst);
+    static const caller_t callers[] = {0, NppArithmScalar<1, nppiMulC_32f_C1R>::calc, NppArithmScalar<2, nppiMulC_32fc_C1R>::calc};
+
+    CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2);
+
+    callers[src.channels()](src, sc, dst);
+}
+
+void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst)
+{
+    typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst);
+    static const caller_t callers[] = {0, NppArithmScalar<1, nppiDivC_32f_C1R>::calc, NppArithmScalar<2, nppiDivC_32fc_C1R>::calc};
+
+    CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2);
+
+    callers[src.channels()](src, sc, dst);
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Absolute difference
+
+void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
+{
+    CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
+
+    CV_Assert(src1.type() == CV_8UC1 || src1.type() == CV_8UC4 || src1.type() == CV_32SC1 || src1.type() == CV_32FC1);
+
+    dst.create( src1.size(), src1.type() );
+
+    NppiSize sz;
+    sz.width  = src1.cols;
+    sz.height = src1.rows;
+
+    switch (src1.type())
+    {
+    case CV_8UC1:
+        nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), src1.step,
+            src2.ptr<Npp8u>(), src2.step,
+            dst.ptr<Npp8u>(), dst.step, sz) );
+        break;
+    case CV_8UC4:
+        nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr<Npp8u>(), src1.step,
+            src2.ptr<Npp8u>(), src2.step,
+            dst.ptr<Npp8u>(), dst.step, sz) );
+        break;
+    case CV_32SC1:
+        nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), src1.step,
+            src2.ptr<Npp32s>(), src2.step,
+            dst.ptr<Npp32s>(), dst.step, sz) );
+        break;
+    case CV_32FC1:
+        nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), src1.step,
+            src2.ptr<Npp32f>(), src2.step,
+            dst.ptr<Npp32f>(), dst.step, sz) );
+        break;
+    default:
+        CV_Assert(!"Unsupported source type");
+    }
+}
+
+void cv::gpu::absdiff(const GpuMat& src, const Scalar& s, GpuMat& dst)
+{
+    CV_Assert(src.type() == CV_32FC1);
+
+    dst.create( src.size(), src.type() );
+
+    NppiSize sz;
+    sz.width  = src.cols;
+    sz.height = src.rows;
+
+    nppSafeCall( nppiAbsDiffC_32f_C1R(src.ptr<Npp32f>(), src.step, dst.ptr<Npp32f>(), dst.step, sz, (Npp32f)s[0]) );
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Comparison of two matrices
+
+namespace cv { namespace gpu { namespace mathfunc
+{
+    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
+    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
+}}}
+
+void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop)
+{
+    CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
+
+    CV_Assert(src1.type() == CV_8UC4 || src1.type() == CV_32FC1);
+
+    dst.create( src1.size(), CV_8UC1 );
+
+    static const NppCmpOp nppCmpOp[] = { NPP_CMP_EQ, NPP_CMP_GREATER, NPP_CMP_GREATER_EQ, NPP_CMP_LESS, NPP_CMP_LESS_EQ };
+
+    NppiSize sz;
+    sz.width  = src1.cols;
+    sz.height = src1.rows;
+
+    if (src1.type() == CV_8UC4)
+    {
+        if (cmpop != CMP_NE)
+        {
+            nppSafeCall( nppiCompare_8u_C4R(src1.ptr<Npp8u>(), src1.step,
+                src2.ptr<Npp8u>(), src2.step,
+                dst.ptr<Npp8u>(), dst.step, sz, nppCmpOp[cmpop]) );
+        }
+        else
+        {
+            mathfunc::compare_ne_8uc4(src1, src2, dst);
+        }
+    }
+    else
+    {
+        if (cmpop != CMP_NE)
+        {
+            nppSafeCall( nppiCompare_32f_C1R(src1.ptr<Npp32f>(), src1.step,
+                src2.ptr<Npp32f>(), src2.step,
+                dst.ptr<Npp8u>(), dst.step, sz, nppCmpOp[cmpop]) );
+        }
+        else
+        {
+            mathfunc::compare_ne_32f(src1, src2, dst);
+        }
+    }
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Unary bitwise logical operations
+
+namespace cv { namespace gpu { namespace mathfunc
+{
+    void bitwiseNotCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src, PtrStep dst, cudaStream_t stream);
+
+    template <typename T>
+    void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream);
+}}}
+
+namespace
+{
+    void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
+    {
+        dst.create(src.size(), src.type());
+
+        cv::gpu::mathfunc::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(),
+                                            dst.channels(), src, dst, stream);
+    }
+
+
+    void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
+    {
+        using namespace cv::gpu;
+
+        typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+        static Caller callers[] = {mathfunc::bitwiseMaskNotCaller<unsigned char>, mathfunc::bitwiseMaskNotCaller<unsigned char>,
+                                   mathfunc::bitwiseMaskNotCaller<unsigned short>, mathfunc::bitwiseMaskNotCaller<unsigned short>,
+                                   mathfunc::bitwiseMaskNotCaller<unsigned int>, mathfunc::bitwiseMaskNotCaller<unsigned int>,
+                                   mathfunc::bitwiseMaskNotCaller<unsigned int>};
+
+        CV_Assert(mask.type() == CV_8U && mask.size() == src.size());
+        dst.create(src.size(), src.type());
+
+        Caller caller = callers[src.depth()];
+        CV_Assert(caller);
+
+        int cn = src.depth() != CV_64F ? src.channels() : src.channels() * (sizeof(double) / sizeof(unsigned int));
+        caller(src.rows, src.cols, cn, src, mask, dst, stream);
+    }
+
+}
+
+
+void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask)
+{
+    if (mask.empty())
+        ::bitwiseNotCaller(src, dst, 0);
+    else
+        ::bitwiseNotCaller(src, dst, mask, 0);
+}
+
+
+void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, const Stream& stream)
+{
+    if (mask.empty())
+        ::bitwiseNotCaller(src, dst, StreamAccessor::getStream(stream));
+    else
+        ::bitwiseNotCaller(src, dst, mask, StreamAccessor::getStream(stream));
+}
+
+
+cv::gpu::GpuMat cv::gpu::operator ~ (const GpuMat& src)
+{
+    GpuMat dst;
+    bitwise_not(src, dst);
+    return dst;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Binary bitwise logical operations
+
+namespace cv { namespace gpu { namespace mathfunc
+{
+    void bitwiseOrCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
+
+    template <typename T>
+    void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream);
+
+    void bitwiseAndCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
+
+    template <typename T>
+    void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream);
+
+    void bitwiseXorCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
+
+    template <typename T>
+    void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream);
+}}}
+
+
+namespace
+{
+    void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
+    {
+        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        dst.create(src1.size(), src1.type());
+
+        cv::gpu::mathfunc::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(),
+                                           dst.channels(), src1, src2, dst, stream);
+    }
+
+
+    void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
+    {
+        using namespace cv::gpu;
+
+        typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+        static Caller callers[] = {mathfunc::bitwiseMaskOrCaller<unsigned char>, mathfunc::bitwiseMaskOrCaller<unsigned char>,
+                                   mathfunc::bitwiseMaskOrCaller<unsigned short>, mathfunc::bitwiseMaskOrCaller<unsigned short>,
+                                   mathfunc::bitwiseMaskOrCaller<unsigned int>, mathfunc::bitwiseMaskOrCaller<unsigned int>,
+                                   mathfunc::bitwiseMaskOrCaller<unsigned int>};
+
+        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        dst.create(src1.size(), src1.type());
+
+        Caller caller = callers[src1.depth()];
+        CV_Assert(caller);
+
+        int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
+        caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
+    }
+
+
+    void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
+    {
+        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        dst.create(src1.size(), src1.type());
+
+        cv::gpu::mathfunc::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(),
+                                            dst.channels(), src1, src2, dst, stream);
+    }
+
+
+    void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
+    {
+        using namespace cv::gpu;
+
+        typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+        static Caller callers[] = {mathfunc::bitwiseMaskAndCaller<unsigned char>, mathfunc::bitwiseMaskAndCaller<unsigned char>,
+                                   mathfunc::bitwiseMaskAndCaller<unsigned short>, mathfunc::bitwiseMaskAndCaller<unsigned short>,
+                                   mathfunc::bitwiseMaskAndCaller<unsigned int>, mathfunc::bitwiseMaskAndCaller<unsigned int>,
+                                   mathfunc::bitwiseMaskAndCaller<unsigned int>};
+
+        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        dst.create(src1.size(), src1.type());
+
+        Caller caller = callers[src1.depth()];
+        CV_Assert(caller);
+
+        int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
+        caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
+    }
+
+
+    void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
+    {
+        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        dst.create(src1.size(), src1.type());
+
+        cv::gpu::mathfunc::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(),
+                                            dst.channels(), src1, src2, dst, stream);
+    }
+
+
+    void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
+    {
+        using namespace cv::gpu;
+
+        typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+        static Caller callers[] = {mathfunc::bitwiseMaskXorCaller<unsigned char>, mathfunc::bitwiseMaskXorCaller<unsigned char>,
+                                   mathfunc::bitwiseMaskXorCaller<unsigned short>, mathfunc::bitwiseMaskXorCaller<unsigned short>,
+                                   mathfunc::bitwiseMaskXorCaller<unsigned int>, mathfunc::bitwiseMaskXorCaller<unsigned int>,
+                                   mathfunc::bitwiseMaskXorCaller<unsigned int>};
+
+        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        dst.create(src1.size(), src1.type());
+
+        Caller caller = callers[src1.depth()];
+        CV_Assert(caller);
+
+        int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
+        caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
+    }
+}
+
+
+void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask)
+{
+    if (mask.empty())
+        ::bitwiseOrCaller(src1, src2, dst, 0);
+    else
+        ::bitwiseOrCaller(src1, src2, dst, mask, 0);
+}
+
+
+void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream)
+{
+    if (mask.empty())
+        ::bitwiseOrCaller(src1, src2, dst, StreamAccessor::getStream(stream));
+    else
+        ::bitwiseOrCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
+}
+
+
+void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask)
+{
+    if (mask.empty())
+        ::bitwiseAndCaller(src1, src2, dst, 0);
+    else
+        ::bitwiseAndCaller(src1, src2, dst, mask, 0);
+}
+
+
+void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream)
+{
+    if (mask.empty())
+        ::bitwiseAndCaller(src1, src2, dst, StreamAccessor::getStream(stream));
+    else
+        ::bitwiseAndCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
+}
+
+
+void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask)
+{
+    if (mask.empty())
+        ::bitwiseXorCaller(src1, src2, dst, 0);
+    else
+        ::bitwiseXorCaller(src1, src2, dst, mask, 0);
+}
+
+
+void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream)
+{
+    if (mask.empty())
+        ::bitwiseXorCaller(src1, src2, dst, StreamAccessor::getStream(stream));
+    else
+        ::bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
+}
+
+
+cv::gpu::GpuMat cv::gpu::operator | (const GpuMat& src1, const GpuMat& src2)
+{
+    GpuMat dst;
+    bitwise_or(src1, src2, dst);
+    return dst;
+}
+
+
+cv::gpu::GpuMat cv::gpu::operator & (const GpuMat& src1, const GpuMat& src2)
+{
+    GpuMat dst;
+    bitwise_and(src1, src2, dst);
+    return dst;
+}
+
+
+cv::gpu::GpuMat cv::gpu::operator ^ (const GpuMat& src1, const GpuMat& src2)
+{
+    GpuMat dst;
+    bitwise_xor(src1, src2, dst);
+    return dst;
+}
+
+#endif
\ No newline at end of file
diff --git a/tests/gpu/src/bitwise_oper.cpp b/tests/gpu/src/bitwise_oper.cpp
index 9e52e76..d391ff9 100644
--- a/tests/gpu/src/bitwise_oper.cpp
+++ b/tests/gpu/src/bitwise_oper.cpp
@@ -53,7 +53,7 @@ using namespace std;
 
 struct CV_GpuBitwiseTest: public CvTest
 {
-    CV_GpuBitwiseTest(): CvTest("GPU-BitwiseOpers", "bitwiseMatOperators") {}
+    CV_GpuBitwiseTest(): CvTest("GPU-BitwiseOpersTest", "bitwiseMatOperators") {}
 
     void run(int)
     {
-- 
2.7.4
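
Not part of the patch: a minimal usage sketch of the per-element API after the move, assuming an OpenCV 2.x build with the gpu module compiled against CUDA (without CUDA the stubs above simply throw). Matrix sizes, fill values, and variable names are illustrative only.

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        // Host inputs; CV_32FC1 is accepted by both the matrix and the scalar arithmetic paths.
        cv::Mat a(256, 256, CV_32FC1, cv::Scalar(3.0f));
        cv::Mat b(256, 256, CV_32FC1, cv::Scalar(2.0f));

        // Upload to the device.
        cv::gpu::GpuMat ga(a), gb(b), gc;

        cv::gpu::add(ga, gb, gc);                   // c = a + b via the NPP-backed caller
        cv::gpu::subtract(ga, cv::Scalar(1.0), gc); // c = a - 1, CV_32FC1 scalar path
        cv::gpu::absdiff(ga, gb, gc);               // c = |a - b|
        cv::gpu::compare(ga, gb, gc, cv::CMP_NE);   // CV_8UC1 mask; CMP_NE uses the custom kernel

        // Bitwise operations; the masked overloads dispatch by element depth.
        cv::Mat i1(256, 256, CV_8UC1, cv::Scalar(0x0F));
        cv::Mat i2(256, 256, CV_8UC1, cv::Scalar(0xF0));
        cv::gpu::GpuMat gi(i1), gj(i2), gk;
        cv::gpu::bitwise_or(gi, gj, gk);            // equivalent to gk = gi | gj

        // Download a result for inspection on the host.
        cv::Mat result;
        gk.download(result);
        return 0;
    }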