From 844bdea5acc7c2ac4739447e6efb924e2de1176b Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 19 Mar 2012 14:18:12 +0000 Subject: [PATCH] fixed several bugs in gpu arithm functions refactored tests for them --- modules/gpu/include/opencv2/gpu/gpu.hpp | 4 +- modules/gpu/src/cuda/element_operations.cu | 114 +- modules/gpu/src/element_operations.cpp | 240 ++-- modules/gpu/test/test_arithm.cpp | 1992 +++++++++++++++++++--------- modules/gpu/test/utility.hpp | 31 +- 5 files changed, 1601 insertions(+), 780 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 98c838b..e377063 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -638,11 +638,11 @@ CV_EXPORTS void bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, S //! pixel by pixel right shift of an image by a constant value //! supports 1, 3 and 4 channels images with integers elements -CV_EXPORTS void rshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null()); +CV_EXPORTS void rshift(const GpuMat& src, Scalar_ sc, GpuMat& dst, Stream& stream = Stream::Null()); //! pixel by pixel left shift of an image by a constant value //! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth -CV_EXPORTS void lshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null()); +CV_EXPORTS void lshift(const GpuMat& src, Scalar_ sc, GpuMat& dst, Stream& stream = Stream::Null()); //! computes per-element minimum of two arrays (dst = min(src1, src2)) CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null()); diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index 27136d5..f5e8459 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -47,7 +47,7 @@ #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/saturate_cast.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { ////////////////////////////////////////////////////////////////////////// // add @@ -684,7 +684,7 @@ namespace cv { namespace gpu { namespace device __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const { return b != 0 ? make_uchar4(saturate_cast(a.x / b), saturate_cast(a.y / b), - saturate_cast(a.z / b), saturate_cast(a.w / b)) + saturate_cast(a.z / b), saturate_cast(a.w / b)) : make_uchar4(0,0,0,0); } }; @@ -706,8 +706,8 @@ namespace cv { namespace gpu { namespace device { __device__ __forceinline__ short4 operator ()(short4 a, float b) const { - return b != 0 ? make_short4(saturate_cast(a.x / b), saturate_cast(a.y / b), - saturate_cast(a.z / b), saturate_cast(a.w / b)) + return b != 0 ? 
make_short4(saturate_cast(a.x / b), saturate_cast(a.y / b), + saturate_cast(a.z / b), saturate_cast(a.w / b)) : make_short4(0,0,0,0); } }; @@ -1106,10 +1106,10 @@ namespace cv { namespace gpu { namespace device //template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); - //template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); + //template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); - template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); - //template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); + template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); + //template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); ////////////////////////////////////////////////////////////////////////////////////// @@ -1251,7 +1251,7 @@ namespace cv { namespace gpu { namespace device template struct UnOp - { + { static __device__ __forceinline__ T call(T v) { return ~v; } }; @@ -1262,7 +1262,7 @@ namespace cv { namespace gpu { namespace device const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4; const int y = blockDim.y * blockIdx.y + threadIdx.y; - if (y < rows) + if (y < rows) { uchar* dst_ptr = dst.ptr(y) + x; const uchar* src_ptr = src.ptr(y) + x; @@ -1283,29 +1283,29 @@ namespace cv { namespace gpu { namespace device template - void bitwiseUnOp(int rows, int width, const PtrStepb src, PtrStepb dst, + void bitwiseUnOp(int rows, int width, const PtrStepb src, PtrStepb dst, cudaStream_t stream) { dim3 threads(16, 16); - dim3 grid(divUp(width, threads.x * sizeof(uint)), + dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y)); bitwiseUnOpKernel<<>>(rows, width, src, dst); cudaSafeCall( cudaGetLastError() ); - if (stream == 0) + if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } template - __global__ void bitwiseUnOpKernel(int rows, int cols, int cn, const PtrStepb src, + __global__ void bitwiseUnOpKernel(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst) { const int x = blockDim.x * blockIdx.x + threadIdx.x; const int y = blockDim.y * blockIdx.y + threadIdx.y; - if (x < cols && y < rows && mask.ptr(y)[x / cn]) + if (x < cols && y < rows && mask.ptr(y)[x / cn]) { T* dst_row = (T*)dst.ptr(y); const T* src_row = (const T*)src.ptr(y); @@ -1316,21 +1316,21 @@ namespace cv { namespace gpu { namespace device template - void bitwiseUnOp(int rows, int cols, int cn, const PtrStepb src, + void bitwiseUnOp(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream) { dim3 threads(16, 16); dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); - bitwiseUnOpKernel<<>>(rows, cols, cn, src, mask, dst); + bitwiseUnOpKernel<<>>(rows, cols, cn, src, mask, dst); cudaSafeCall( cudaGetLastError() ); - if (stream == 0) + if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, + void 
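The uchar4/float and short4/float divide functors above return an all-zero pixel when the divisor is zero instead of performing the division; the angle-bracketed template arguments (likely saturate_cast<uchar> and saturate_cast<short>) were lost in extraction. A minimal host-side C++ sketch of that guard follows; Pixel4u8 and saturate_u8 are illustrative stand-ins for the CUDA uchar4 vector type and the device-side saturate_cast, not names from the patch.

#include <algorithm>
#include <cstdio>

// Stand-ins for the CUDA uchar4 vector type and the device-side saturate_cast<uchar>.
struct Pixel4u8 { unsigned char x, y, z, w; };

static unsigned char saturate_u8(float v)
{
    // Clamp to [0, 255] and round, approximating saturate_cast<uchar>(float).
    const float c = std::min(std::max(v, 0.0f), 255.0f);
    return static_cast<unsigned char>(c + 0.5f);
}

// Per-pixel division of a 4-channel 8-bit pixel by one float; a zero divisor
// yields a zero pixel, mirroring the b != 0 guard visible in the functor.
static Pixel4u8 divide4(Pixel4u8 a, float b)
{
    if (b == 0.0f)
        return Pixel4u8{0, 0, 0, 0};
    return Pixel4u8{saturate_u8(a.x / b), saturate_u8(a.y / b),
                    saturate_u8(a.z / b), saturate_u8(a.w / b)};
}

int main()
{
    const Pixel4u8 p{200, 100, 50, 255};
    const Pixel4u8 r = divide4(p, 0.5f);  // channels doubled, saturating at 255
    const Pixel4u8 z = divide4(p, 0.0f);  // guarded: all channels are 0
    std::printf("%u %u %u %u | %u %u %u %u\n", r.x, r.y, r.z, r.w, z.x, z.y, z.z, z.w);
    return 0;
}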
bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream) { bitwiseUnOp(rows, static_cast(cols * elem_size1 * cn), src, dst, stream); @@ -1338,7 +1338,7 @@ namespace cv { namespace gpu { namespace device template - void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, + void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream) { bitwiseUnOp(rows, cols * cn, cn, src, mask, dst, stream); @@ -1359,32 +1359,32 @@ namespace cv { namespace gpu { namespace device template struct BinOp - { - static __device__ __forceinline__ T call(T a, T b) { return a | b; } + { + static __device__ __forceinline__ T call(T a, T b) { return a | b; } }; template struct BinOp - { - static __device__ __forceinline__ T call(T a, T b) { return a & b; } + { + static __device__ __forceinline__ T call(T a, T b) { return a & b; } }; template struct BinOp - { - static __device__ __forceinline__ T call(T a, T b) { return a ^ b; } + { + static __device__ __forceinline__ T call(T a, T b) { return a ^ b; } }; template - __global__ void bitwiseBinOpKernel(int rows, int width, const PtrStepb src1, + __global__ void bitwiseBinOpKernel(int rows, int width, const PtrStepb src1, const PtrStepb src2, PtrStepb dst) { const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4; const int y = blockDim.y * blockIdx.y + threadIdx.y; - if (y < rows) + if (y < rows) { uchar* dst_ptr = dst.ptr(y) + x; const uchar* src1_ptr = src1.ptr(y) + x; @@ -1407,7 +1407,7 @@ namespace cv { namespace gpu { namespace device template - void bitwiseBinOp(int rows, int width, const PtrStepb src1, const PtrStepb src2, + void bitwiseBinOp(int rows, int width, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream) { dim3 threads(16, 16); @@ -1416,20 +1416,20 @@ namespace cv { namespace gpu { namespace device bitwiseBinOpKernel<<>>(rows, width, src1, src2, dst); cudaSafeCall( cudaGetLastError() ); - if (stream == 0) + if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } template __global__ void bitwiseBinOpKernel( - int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, + int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst) { const int x = blockDim.x * blockIdx.x + threadIdx.x; const int y = blockDim.y * blockIdx.y + threadIdx.y; - if (x < cols && y < rows && mask.ptr(y)[x / cn]) + if (x < cols && y < rows && mask.ptr(y)[x / cn]) { T* dst_row = (T*)dst.ptr(y); const T* src1_row = (const T*)src1.ptr(y); @@ -1441,7 +1441,7 @@ namespace cv { namespace gpu { namespace device template - void bitwiseBinOp(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, + void bitwiseBinOp(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream) { dim3 threads(16, 16); @@ -1450,12 +1450,12 @@ namespace cv { namespace gpu { namespace device bitwiseBinOpKernel<<>>(rows, cols, cn, src1, src2, mask, dst); cudaSafeCall( cudaGetLastError() ); - if (stream == 0) + if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, + void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream) { bitwiseBinOp(rows, static_cast(cols * elem_size1 * cn), src1, src2, dst, stream); @@ -1463,7 +1463,7 @@ 
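The masked bitwiseUnOpKernel/bitwiseBinOpKernel variants above gate each element with mask.ptr(y)[x / cn]: the image rows are interleaved with cn channels while the mask holds one byte per pixel, so dividing the element index by the channel count maps every channel of a pixel to the same mask entry. A small host-side sketch of that indexing (function and variable names are illustrative):

#include <cstdio>
#include <vector>

// One row of an interleaved cn-channel image OR-ed under a single-channel mask;
// the x / cn index makes one mask byte gate all channels of its pixel.
static void bitwise_or_masked_row(const unsigned char* src1, const unsigned char* src2,
                                  const unsigned char* mask, unsigned char* dst,
                                  int cols, int cn)
{
    for (int x = 0; x < cols * cn; ++x)
    {
        if (mask[x / cn])
            dst[x] = src1[x] | src2[x];
    }
}

int main()
{
    const int cols = 4, cn = 3;
    std::vector<unsigned char> a(cols * cn, 0x0F), b(cols * cn, 0xF0), d(cols * cn, 0x00);
    const unsigned char mask[cols] = {1, 0, 1, 0};  // process pixels 0 and 2 only

    bitwise_or_masked_row(a.data(), b.data(), mask, d.data(), cols, cn);

    for (int x = 0; x < cols * cn; ++x)
        std::printf("%02X ", d[x]);                 // FF FF FF 00 00 00 FF FF FF 00 00 00
    std::printf("\n");
    return 0;
}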
namespace cv { namespace gpu { namespace device template - void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, + void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream) { bitwiseBinOp(rows, cols * cn, cn, src1, src2, mask, dst, stream); @@ -1474,7 +1474,7 @@ namespace cv { namespace gpu { namespace device template void bitwiseMaskOrCaller(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, + void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream) { bitwiseBinOp(rows, static_cast(cols * elem_size1 * cn), src1, src2, dst, stream); @@ -1482,7 +1482,7 @@ namespace cv { namespace gpu { namespace device template - void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, + void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream) { bitwiseBinOp(rows, cols * cn, cn, src1, src2, mask, dst, stream); @@ -1493,7 +1493,7 @@ namespace cv { namespace gpu { namespace device template void bitwiseMaskAndCaller(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, + void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream) { bitwiseBinOp(rows, static_cast(cols * elem_size1 * cn), src1, src2, dst, stream); @@ -1501,7 +1501,7 @@ namespace cv { namespace gpu { namespace device template - void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, + void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream) { bitwiseBinOp(rows, cols * cn, cn, src1, src2, mask, dst, stream); @@ -1546,7 +1546,7 @@ namespace cv { namespace gpu { namespace device template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream) { - cv::gpu::device::transform(src1, src2, dst, minimum(), WithOutMask(), stream); + cv::gpu::device::transform(src1, src2, dst, minimum(), WithOutMask(), stream); } template void min_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); @@ -1560,7 +1560,7 @@ namespace cv { namespace gpu { namespace device template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream) { - cv::gpu::device::transform(src1, src2, dst, maximum(), WithOutMask(), stream); + cv::gpu::device::transform(src1, src2, dst, maximum(), WithOutMask(), stream); } template void max_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); @@ -1574,7 +1574,7 @@ namespace cv { namespace gpu { namespace device template void min_gpu(const DevMem2D_& src1, T src2, const DevMem2D_& dst, cudaStream_t stream) { - cv::gpu::device::transform(src1, dst, device::bind2nd(minimum(), src2), WithOutMask(), stream); + cv::gpu::device::transform(src1, dst, device::bind2nd(minimum(), src2), WithOutMask(), stream); } template void min_gpu(const DevMem2Db& src1, uchar src2, const 
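The unmasked bitwise kernels index with x = (blockDim.x * blockIdx.x + threadIdx.x) * 4 and size the grid with divUp(width, threads.x * sizeof(uint)), i.e. each thread handles four bytes as one 32-bit word, with a byte-at-a-time tail at the row edge. The kernel bodies are partly lost in this extraction, so the sketch below only illustrates that word-plus-tail idea on the host; it is not the device code from the patch.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// AND two byte rows four bytes at a time as 32-bit words, then finish the
// remaining width % 4 bytes individually.
static void bitwise_and_row(const unsigned char* a, const unsigned char* b,
                            unsigned char* dst, int width)
{
    int x = 0;
    for (; x + 4 <= width; x += 4)
    {
        std::uint32_t wa, wb;
        std::memcpy(&wa, a + x, 4);   // memcpy keeps the host version alignment-safe
        std::memcpy(&wb, b + x, 4);
        const std::uint32_t wr = wa & wb;
        std::memcpy(dst + x, &wr, 4);
    }
    for (; x < width; ++x)            // tail bytes
        dst[x] = static_cast<unsigned char>(a[x] & b[x]);
}

int main()
{
    std::vector<unsigned char> a(10, 0xFF), b(10, 0x0F), d(10, 0x00);
    bitwise_and_row(a.data(), b.data(), d.data(), 10);
    for (int i = 0; i < 10; ++i)
        std::printf("%02X ", d[i]);   // ten times 0F
    std::printf("\n");
    return 0;
}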
DevMem2Db& dst, cudaStream_t stream); @@ -1588,7 +1588,7 @@ namespace cv { namespace gpu { namespace device template void max_gpu(const DevMem2D_& src1, T src2, const DevMem2D_& dst, cudaStream_t stream) { - cv::gpu::device::transform(src1, dst, device::bind2nd(maximum(), src2), WithOutMask(), stream); + cv::gpu::device::transform(src1, dst, device::bind2nd(maximum(), src2), WithOutMask(), stream); } template void max_gpu(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream); @@ -1647,12 +1647,12 @@ namespace cv { namespace gpu { namespace device { typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, T thresh, T maxVal, cudaStream_t stream); - static const caller_t callers[] = + static const caller_t callers[] = { - threshold_caller, - threshold_caller, - threshold_caller, - threshold_caller, + threshold_caller, + threshold_caller, + threshold_caller, + threshold_caller, threshold_caller }; @@ -1671,14 +1671,14 @@ namespace cv { namespace gpu { namespace device // pow template::is_signed> struct PowOp : unary_function - { + { float power; PowOp(float power_) : power(power_) {} - + __device__ __forceinline__ T operator()(const T& e) const - { + { return saturate_cast(__powf((float)e, power)); - } + } }; template struct PowOp : unary_function @@ -1688,11 +1688,11 @@ namespace cv { namespace gpu { namespace device __device__ __forceinline__ float operator()(const T& e) const { - T res = saturate_cast(__powf((float)e, power)); - + T res = saturate_cast(__powf((float)e, power)); + if ( (e < 0) && (1 & (int)power) ) - res *= -1; - return res; + res *= -1; + return res; } }; @@ -1736,7 +1736,7 @@ namespace cv { namespace gpu { namespace device void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream) { cv::gpu::device::transform((DevMem2D_)src, (DevMem2D_)dst, PowOp(power), WithOutMask(), stream); - } + } template void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream); template void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream); diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp index 35926f1..9abe173 100644 --- a/modules/gpu/src/element_operations.cpp +++ b/modules/gpu/src/element_operations.cpp @@ -71,8 +71,8 @@ void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, void cv::gpu::bitwise_and(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::bitwise_xor(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } -void cv::gpu::rshift(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } -void cv::gpu::lshift(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::rshift(const GpuMat&, Scalar_, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::lshift(const GpuMat&, Scalar_, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::min(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } @@ -101,11 +101,11 @@ namespace template struct NppArithmFunc { typedef typename NppTypeTraits::npp_t npp_t; - + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int 
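The scalar forms of min_gpu/max_gpu above run a single transform whose functor is built by binding the constant as the second operand of minimum/maximum (device::bind2nd; the template arguments are stripped in this extraction). The host-side sketch below shows the same binding idea with std::transform; minimum and binder2nd here are illustrative re-implementations, not the device headers.

#include <algorithm>
#include <cstdio>
#include <vector>

template <typename T> struct minimum
{
    T operator()(T a, T b) const { return a < b ? a : b; }
};

// Fix the second operand of a binary functor, producing a unary functor that a
// single pass of transform can apply to every element.
template <typename Op, typename T> struct binder2nd
{
    Op op;
    T rhs;
    T operator()(T lhs) const { return op(lhs, rhs); }
};

int main()
{
    const std::vector<unsigned char> src = {10, 200, 90, 255};
    std::vector<unsigned char> dst(src.size());

    const binder2nd<minimum<unsigned char>, unsigned char> clampTo100 = {minimum<unsigned char>(), 100};
    std::transform(src.begin(), src.end(), dst.begin(), clampTo100);

    for (unsigned char v : dst)
        std::printf("%u ", v);  // 10 100 90 100
    std::printf("\n");
    return 0;
}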
nScaleFactor); }; template <> struct NppArithmFunc - { + { typedef NppTypeTraits::npp_t npp_t; typedef NppStatus (*func_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst, int nDstStep, NppiSize oSizeROI); @@ -123,7 +123,7 @@ namespace sz.width = src1.cols; sz.height = src1.rows; - nppSafeCall( func((const npp_t*)src1.data, static_cast(src1.step), (const npp_t*)src2.data, static_cast(src2.step), + nppSafeCall( func((const npp_t*)src1.data, static_cast(src1.step), (const npp_t*)src2.data, static_cast(src2.step), (npp_t*)dst.data, static_cast(dst.step), sz, 0) ); if (stream == 0) @@ -145,8 +145,8 @@ namespace NppiSize sz; sz.width = src1.cols; sz.height = src1.rows; - - nppSafeCall( func((const npp_t*)src1.data, static_cast(src1.step), (const npp_t*)src2.data, static_cast(src2.step), + + nppSafeCall( func((const npp_t*)src1.data, static_cast(src1.step), (const npp_t*)src2.data, static_cast(src2.step), (npp_t*)dst.data, static_cast(dst.step), sz) ); if (stream == 0) @@ -162,12 +162,12 @@ namespace //////////////////////////////////////////////////////////////////////// // add -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { - template + template void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream); - template + template void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream); }}} @@ -177,7 +177,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream); - static const func_t funcs[7][7] = + static const func_t funcs[7][7] = { {add_gpu, 0/*add_gpu*/, add_gpu, add_gpu, add_gpu, add_gpu, add_gpu}, {0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/}, @@ -188,7 +188,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu {0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, add_gpu} }; - static const func_t npp_funcs[7] = + static const func_t npp_funcs[7] = { NppArithm::call, 0, @@ -228,21 +228,21 @@ namespace { typedef typename NppTypeTraits::npp_t npp_t; - typedef NppStatus (*func_ptr)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pConstants, + typedef NppStatus (*func_ptr)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pConstants, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); }; template struct NppArithmScalarFunc { typedef typename NppTypeTraits::npp_t npp_t; - typedef NppStatus (*func_ptr)(const npp_t* pSrc1, int nSrc1Step, const npp_t pConstants, + typedef NppStatus (*func_ptr)(const npp_t* pSrc1, int nSrc1Step, const npp_t pConstants, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); }; template struct NppArithmScalarFunc { typedef typename NppTypeTraits::npp_complex_type npp_complex_type; - typedef NppStatus (*func_ptr)(const npp_complex_type* pSrc1, int nSrc1Step, const npp_complex_type pConstants, + typedef NppStatus (*func_ptr)(const npp_complex_type* pSrc1, int nSrc1Step, const npp_complex_type pConstants, npp_complex_type* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); }; template struct NppArithmScalarFunc @@ -313,7 +313,7 @@ namespace nConstant.re = saturate_cast(sc.val[0]); nConstant.im = saturate_cast(sc.val[1]); - nppSafeCall( func(src.ptr(), 
static_cast(src.step), nConstant, + nppSafeCall( func(src.ptr(), static_cast(src.step), nConstant, dst.ptr(), static_cast(dst.step), sz, 0) ); if (stream == 0) @@ -382,7 +382,7 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream); - static const func_t funcs[7][7] = + static const func_t funcs[7][7] = { {add_gpu, 0/*add_gpu*/, add_gpu, add_gpu, add_gpu, add_gpu, add_gpu}, {0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/}, @@ -394,7 +394,7 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat }; typedef void (*npp_func_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream); - static const npp_func_t npp_funcs[7][4] = + static const npp_func_t npp_funcs[7][4] = { {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, {0,0,0,0}, @@ -436,12 +436,12 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat //////////////////////////////////////////////////////////////////////// // subtract -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { - template + template void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream); - template + template void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream); }}} @@ -451,7 +451,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream); - static const func_t funcs[7][7] = + static const func_t funcs[7][7] = { {subtract_gpu, 0/*subtract_gpu*/, subtract_gpu, subtract_gpu, subtract_gpu, subtract_gpu, subtract_gpu}, {0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/}, @@ -462,15 +462,14 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons {0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, subtract_gpu} }; - static const func_t npp_funcs[7] = + static const func_t npp_funcs[6] = { NppArithm::call, 0, NppArithm::call, NppArithm::call, NppArithm::call, - NppArithm::call, - subtract_gpu + NppArithm::call }; CV_Assert(src1.type() != CV_8S); @@ -484,7 +483,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons cudaStream_t stream = StreamAccessor::getStream(s); - if (mask.empty() && dst.type() == src1.type()) + if (mask.empty() && dst.type() == src1.type() && src1.depth() <= CV_32F) { npp_funcs[src1.depth()](src2.reshape(1), src1.reshape(1), dst.reshape(1), PtrStepb(), stream); return; @@ -502,7 +501,7 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream); - static const func_t funcs[7][7] = + static const func_t funcs[7][7] = { {subtract_gpu, 0/*subtract_gpu*/, subtract_gpu, subtract_gpu, subtract_gpu, subtract_gpu, subtract_gpu}, {0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/}, @@ -514,7 +513,7 @@ 
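One of the behavioural fixes above: subtract() (and divide() further down) now take the single-call NPP path only when there is no mask, the destination has the source type, and the depth is at most CV_32F; the npp_funcs table correspondingly shrinks from 7 to 6 entries, and every other case goes through the templated CUDA kernels. A self-contained sketch of that guard, with illustrative function names standing in for the real NPP and kernel callers:

#include <cstdio>

// Depth constants mirror the CV_8U..CV_64F ordering; the two callees are stand-ins.
enum Depth { D8U = 0, D8S, D16U, D16S, D32S, D32F, D64F };

static void npp_subtract()    { std::puts("NPP fast path"); }
static void kernel_subtract() { std::puts("templated kernel fallback"); }

static void subtract(Depth srcDepth, Depth dstDepth, bool hasMask)
{
    if (!hasMask && dstDepth == srcDepth && srcDepth <= D32F)
        npp_subtract();        // whole matrix handled by one NPP call
    else
        kernel_subtract();     // mask, type conversion or 64-bit depth: CUDA kernel
}

int main()
{
    subtract(D8U,  D8U,  false);  // fast path
    subtract(D8U,  D16S, false);  // fallback: destination depth differs
    subtract(D64F, D64F, false);  // fallback: depth above CV_32F
    subtract(D8U,  D8U,  true);   // fallback: mask requested
    return 0;
}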
void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G }; typedef void (*npp_func_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream); - static const npp_func_t npp_funcs[7][4] = + static const npp_func_t npp_funcs[7][4] = { {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, {0,0,0,0}, @@ -556,15 +555,15 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G //////////////////////////////////////////////////////////////////////// // multiply -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { void multiply_gpu(const DevMem2D_& src1, const DevMem2Df& src2, const DevMem2D_& dst, cudaStream_t stream); void multiply_gpu(const DevMem2D_& src1, const DevMem2Df& src2, const DevMem2D_& dst, cudaStream_t stream); - template + template void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream); - template + template void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream); }}} @@ -574,7 +573,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream); - static const func_t funcs[7][7] = + static const func_t funcs[7][7] = { {multiply_gpu, 0/*multiply_gpu*/, multiply_gpu, multiply_gpu, multiply_gpu, multiply_gpu, multiply_gpu}, {0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/}, @@ -585,7 +584,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub {0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, multiply_gpu} }; - static const func_t npp_funcs[7] = + static const func_t npp_funcs[7] = { NppArithm::call, 0, @@ -651,7 +650,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream); - static const func_t funcs[7][7] = + static const func_t funcs[7][7] = { {multiply_gpu, 0/*multiply_gpu*/, multiply_gpu, multiply_gpu, multiply_gpu, multiply_gpu, multiply_gpu}, {0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/}, @@ -663,7 +662,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double }; typedef void (*npp_func_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream); - static const npp_func_t npp_funcs[7][4] = + static const npp_func_t npp_funcs[7][4] = { {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, {0,0,0,0}, @@ -702,18 +701,18 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double //////////////////////////////////////////////////////////////////////// // divide -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { void divide_gpu(const DevMem2D_& src1, const DevMem2Df& src2, const DevMem2D_& dst, cudaStream_t stream); void divide_gpu(const DevMem2D_& src1, const DevMem2Df& src2, const DevMem2D_& dst, cudaStream_t stream); - template + template void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t 
stream); - template + template void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream); - template + template void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); }}} @@ -723,7 +722,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream); - static const func_t funcs[7][7] = + static const func_t funcs[7][7] = { {divide_gpu, 0/*divide_gpu*/, divide_gpu, divide_gpu, divide_gpu, divide_gpu, divide_gpu}, {0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/}, @@ -734,15 +733,14 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double {0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, divide_gpu} }; - static const func_t npp_funcs[7] = + static const func_t npp_funcs[6] = { NppArithm::call, 0, NppArithm::call, NppArithm::call, NppArithm::call, - NppArithm::call, - divide_gpu + NppArithm::call }; cudaStream_t stream = StreamAccessor::getStream(s); @@ -753,7 +751,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double dst.create(src1.size(), src1.type()); - multiply_gpu(static_cast >(src1), static_cast(src2), static_cast >(dst), stream); + divide_gpu(static_cast >(src1), static_cast(src2), static_cast >(dst), stream); } else if (src1.type() == CV_16SC4 && src2.type() == CV_32FC1) { @@ -761,10 +759,10 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double dst.create(src1.size(), src1.type()); - multiply_gpu(static_cast >(src1), static_cast(src2), static_cast >(dst), stream); + divide_gpu(static_cast >(src1), static_cast(src2), static_cast >(dst), stream); } else - { + { CV_Assert(src1.type() != CV_8S); CV_Assert(src1.type() == src2.type() && src1.size() == src2.size()); @@ -773,7 +771,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels())); - if (scale == 1 && dst.type() == src1.type()) + if (scale == 1 && dst.type() == src1.type() && src1.depth() <= CV_32F) { npp_funcs[src1.depth()](src2.reshape(1), src1.reshape(1), dst.reshape(1), 1, stream); return; @@ -792,7 +790,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream); - static const func_t funcs[7][7] = + static const func_t funcs[7][7] = { {divide_gpu, 0/*divide_gpu*/, divide_gpu, divide_gpu, divide_gpu, divide_gpu, divide_gpu}, {0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/}, @@ -804,7 +802,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc }; typedef void (*npp_func_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream); - static const npp_func_t npp_funcs[7][4] = + static const npp_func_t npp_funcs[7][4] = { {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, {0,0,0,0}, @@ -846,7 +844,7 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St typedef void (*func_t)(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); - static const func_t 
funcs[7][7] = + static const func_t funcs[7][7] = { {divide_gpu, 0/*divide_gpu*/, divide_gpu, divide_gpu, divide_gpu, divide_gpu, divide_gpu}, {0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/}, @@ -875,12 +873,12 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St ////////////////////////////////////////////////////////////////////////////// // absdiff -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { template void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); - template + template void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream); }}} @@ -890,7 +888,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); - static const func_t funcs[] = + static const func_t funcs[] = { absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu }; @@ -909,7 +907,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea { NppStreamHandler h(stream); - nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), + nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), dst.ptr(), static_cast(dst.step), sz) ); if (stream == 0) @@ -919,7 +917,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea { NppStreamHandler h(stream); - nppSafeCall( nppiAbsDiff_16u_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), + nppSafeCall( nppiAbsDiff_16u_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), dst.ptr(), static_cast(dst.step), sz) ); if (stream == 0) @@ -929,7 +927,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea { NppStreamHandler h(stream); - nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), + nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), dst.ptr(), static_cast(dst.step), sz) ); if (stream == 0) @@ -969,7 +967,7 @@ namespace sz.width = src1.cols; sz.height = src1.rows; - nppSafeCall( func((const npp_t*)src1.data, static_cast(src1.step), (npp_t*)dst.data, static_cast(dst.step), + nppSafeCall( func((const npp_t*)src1.data, static_cast(src1.step), (npp_t*)dst.data, static_cast(dst.step), sz, static_cast(val)) ); if (stream == 0) @@ -984,14 +982,14 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream); - static const func_t funcs[] = + static const func_t funcs[] = { - NppAbsDiffC::call, - absdiff_gpu, - NppAbsDiffC::call, + NppAbsDiffC::call, + absdiff_gpu, + NppAbsDiffC::call, absdiff_gpu, - absdiff_gpu, - NppAbsDiffC::call, + absdiff_gpu, + NppAbsDiffC::call, absdiff_gpu }; @@ -1132,7 +1130,7 @@ void cv::gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream) { typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = + static const func_t funcs[] = { NppSqr::call, 0, @@ -1209,7 +1207,7 @@ void cv::gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream) { typedef void 
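Most functions in this file dispatch through a static funcs[7][7] table of function pointers, indexed by source and destination depth, with null entries for unsupported pairs (the commented-out instantiations in the .cu file are the same holes). A compressed sketch of that pattern with two illustrative entries:

#include <cassert>
#include <cstdio>

typedef void (*func_t)(int rows, int cols);

static void op_8u_to_8u(int, int)  { std::puts("8U -> 8U kernel"); }
static void op_8u_to_32f(int, int) { std::puts("8U -> 32F kernel"); }

// Rows: source depth (CV_8U..CV_64F); columns: destination depth.
// A null pointer marks a combination that was never instantiated.
static const func_t funcs[7][7] =
{
    { op_8u_to_8u, 0, 0, 0, 0, op_8u_to_32f, 0 },  // from CV_8U
    { 0, 0, 0, 0, 0, 0, 0 },                       // from CV_8S (unsupported)
    { 0, 0, 0, 0, 0, 0, 0 },
    { 0, 0, 0, 0, 0, 0, 0 },
    { 0, 0, 0, 0, 0, 0, 0 },
    { 0, 0, 0, 0, 0, 0, 0 },
    { 0, 0, 0, 0, 0, 0, 0 }
};

static void dispatch(int srcDepth, int dstDepth)
{
    const func_t func = funcs[srcDepth][dstDepth];
    assert(func != 0 && "unsupported source/destination depth pair");
    func(480, 640);
}

int main()
{
    dispatch(0, 0);  // CV_8U -> CV_8U
    dispatch(0, 5);  // CV_8U -> CV_32F
    return 0;
}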
(*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = + static const func_t funcs[] = { NppOneSource::call, 0, @@ -1233,7 +1231,7 @@ void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream) { typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = + static const func_t funcs[] = { NppOneSource::call, 0, @@ -1257,7 +1255,7 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream) { typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = + static const func_t funcs[] = { NppOneSource::call, 0, @@ -1277,7 +1275,7 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream) ////////////////////////////////////////////////////////////////////////////// // Comparison of two matrixes -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { template void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); template void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); @@ -1291,7 +1289,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); - static const func_t funcs[7][4] = + static const func_t funcs[7][4] = { {compare_eq, compare_ne, compare_lt, compare_le}, {compare_eq, compare_ne, compare_lt, compare_le}, @@ -1353,7 +1351,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c ////////////////////////////////////////////////////////////////////////////// // Unary bitwise logical operations -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream); @@ -1377,9 +1375,9 @@ namespace typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - static Caller callers[] = + static Caller callers[] = { - bitwiseMaskNotCaller, bitwiseMaskNotCaller, + bitwiseMaskNotCaller, bitwiseMaskNotCaller, bitwiseMaskNotCaller, bitwiseMaskNotCaller, bitwiseMaskNotCaller, bitwiseMaskNotCaller, bitwiseMaskNotCaller @@ -1410,7 +1408,7 @@ void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, St ////////////////////////////////////////////////////////////////////////////// // Binary bitwise logical operations -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream); @@ -1444,9 +1442,9 @@ namespace typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - static Caller callers[] = + static Caller callers[] = { - bitwiseMaskOrCaller, bitwiseMaskOrCaller, + bitwiseMaskOrCaller, bitwiseMaskOrCaller, bitwiseMaskOrCaller, bitwiseMaskOrCaller, bitwiseMaskOrCaller, bitwiseMaskOrCaller, bitwiseMaskOrCaller @@ -1478,9 +1476,9 @@ namespace typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - static Caller callers[] = + static Caller callers[] = { - bitwiseMaskAndCaller, bitwiseMaskAndCaller, + bitwiseMaskAndCaller, 
bitwiseMaskAndCaller, bitwiseMaskAndCaller, bitwiseMaskAndCaller, bitwiseMaskAndCaller, bitwiseMaskAndCaller, bitwiseMaskAndCaller @@ -1512,9 +1510,9 @@ namespace typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - static Caller callers[] = + static Caller callers[] = { - bitwiseMaskXorCaller, bitwiseMaskXorCaller, + bitwiseMaskXorCaller, bitwiseMaskXorCaller, bitwiseMaskXorCaller, bitwiseMaskXorCaller, bitwiseMaskXorCaller, bitwiseMaskXorCaller, bitwiseMaskXorCaller @@ -1584,7 +1582,7 @@ namespace const npp_t pConstants[] = {static_cast(sc.val[0]), static_cast(sc.val[1]), static_cast(sc.val[2]), static_cast(sc.val[3])}; - nppSafeCall( func(src.ptr(), static_cast(src.step), pConstants, dst.ptr(), static_cast(dst.step), oSizeROI) ); + nppSafeCall( func(src.ptr(), static_cast(src.step), pConstants, dst.ptr(), static_cast(dst.step), oSizeROI) ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -1602,7 +1600,7 @@ namespace oSizeROI.width = src.cols; oSizeROI.height = src.rows; - nppSafeCall( func(src.ptr(), static_cast(src.step), static_cast(sc.val[0]), dst.ptr(), static_cast(dst.step), oSizeROI) ); + nppSafeCall( func(src.ptr(), static_cast(src.step), static_cast(sc.val[0]), dst.ptr(), static_cast(dst.step), oSizeROI) ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -1614,7 +1612,7 @@ void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Strea { typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[5][4] = + static const func_t funcs[5][4] = { {NppBitwiseC::call, 0, NppBitwiseC::call, NppBitwiseC::call}, {0,0,0,0}, @@ -1635,7 +1633,7 @@ void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre { typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[5][4] = + static const func_t funcs[5][4] = { {NppBitwiseC::call, 0, NppBitwiseC::call, NppBitwiseC::call}, {0,0,0,0}, @@ -1656,7 +1654,7 @@ void cv::gpu::bitwise_xor(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre { typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[5][4] = + static const func_t funcs[5][4] = { {NppBitwiseC::call, 0, NppBitwiseC::call, NppBitwiseC::call}, {0,0,0,0}, @@ -1704,7 +1702,7 @@ namespace oSizeROI.height = src.rows; nppSafeCall( func(src.ptr(), static_cast(src.step), sc.val, dst.ptr(), static_cast(dst.step), oSizeROI) ); - + if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } @@ -1722,17 +1720,17 @@ namespace oSizeROI.height = src.rows; nppSafeCall( func(src.ptr(), static_cast(src.step), sc.val[0], dst.ptr(), static_cast(dst.step), oSizeROI) ); - + if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } }; } -void cv::gpu::rshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) +void cv::gpu::rshift(const GpuMat& src, Scalar_ sc, GpuMat& dst, Stream& stream) { typedef void (*func_t)(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[5][4] = + static const func_t funcs[5][4] = { {NppShift::call, 0, NppShift::call, NppShift::call }, {NppShift::call, 0, NppShift::call, NppShift::call }, @@ -1749,10 +1747,10 @@ void cv::gpu::rshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& s funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream)); } -void cv::gpu::lshift(const GpuMat& src, const Scalar& 
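rshift()/lshift() now take a Scalar_ with an unsigned NPP integer element type (the template argument is lost in this extraction), so each channel carries its own shift count, which the NppShift wrappers forward as per-channel constants. A tiny sketch of that per-channel semantics; the function name and the fixed 4-channel layout are illustrative:

#include <cstdio>

// Right-shift each channel of a 4-channel 8-bit pixel by its own count.
static void rshift_pixel(const unsigned char src[4], const unsigned int shift[4],
                         unsigned char dst[4])
{
    for (int c = 0; c < 4; ++c)
        dst[c] = static_cast<unsigned char>(src[c] >> shift[c]);
}

int main()
{
    const unsigned char px[4]     = {0x80, 0x80, 0x80, 0x80};
    const unsigned int  shifts[4] = {1, 2, 3, 4};  // one shift count per channel
    unsigned char out[4];

    rshift_pixel(px, shifts, out);
    std::printf("%02X %02X %02X %02X\n", out[0], out[1], out[2], out[3]);  // 40 20 10 08
    return 0;
}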
sc, GpuMat& dst, Stream& stream) +void cv::gpu::lshift(const GpuMat& src, Scalar_ sc, GpuMat& dst, Stream& stream) { typedef void (*func_t)(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[5][4] = + static const func_t funcs[5][4] = { {NppShift::call , 0, NppShift::call , NppShift::call }, {0 , 0, 0 , 0 }, @@ -1772,7 +1770,7 @@ void cv::gpu::lshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& s ////////////////////////////////////////////////////////////////////////////// // Minimum and maximum operations -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); @@ -1803,7 +1801,7 @@ namespace dst.create(src1.size(), src1.type()); ::cv::gpu::device::min_gpu(src1.reshape(1), saturate_cast(src2), dst.reshape(1), stream); } - + template void max_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream) { @@ -1820,58 +1818,58 @@ namespace } } -void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) -{ +void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) +{ CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - CV_Assert((src1.depth() != CV_64F) || + CV_Assert((src1.depth() != CV_64F) || (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = + static const func_t funcs[] = { - min_caller, min_caller, min_caller, min_caller, min_caller, + min_caller, min_caller, min_caller, min_caller, min_caller, min_caller, min_caller }; funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); } -void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream) +void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream) { - CV_Assert((src1.depth() != CV_64F) || + CV_Assert((src1.depth() != CV_64F) || (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = + static const func_t funcs[] = { - min_caller, min_caller, min_caller, min_caller, min_caller, + min_caller, min_caller, min_caller, min_caller, min_caller, min_caller, min_caller }; funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); } -void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) -{ +void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) +{ CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - CV_Assert((src1.depth() != CV_64F) || + CV_Assert((src1.depth() != CV_64F) || (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = + static const func_t funcs[] = { - max_caller, max_caller, max_caller, max_caller, max_caller, + max_caller, max_caller, max_caller, max_caller, max_caller, max_caller, max_caller }; funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); } -void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream) +void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& 
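min_caller/max_caller above convert the double constant with saturate_cast before the per-element comparison, so an out-of-range constant is clamped to the element type's range rather than wrapping. A host sketch of that order of operations; the plain truncation in saturate() stands in for OpenCV's rounding saturate_cast:

#include <algorithm>
#include <cstdio>
#include <limits>

template <typename T> static T saturate(double v)
{
    const double lo = static_cast<double>(std::numeric_limits<T>::min());
    const double hi = static_cast<double>(std::numeric_limits<T>::max());
    return static_cast<T>(std::min(std::max(v, lo), hi));
}

// Clamp the scalar to T's range first, then take the per-element minimum.
template <typename T> static T min_with_scalar(T element, double scalar)
{
    return std::min(element, saturate<T>(scalar));
}

int main()
{
    // With 8-bit elements a threshold of 1000.0 saturates to 255, so the element
    // passes through unchanged instead of being compared against a wrapped value.
    std::printf("%d\n", min_with_scalar<unsigned char>(200, 1000.0));  // 200
    std::printf("%d\n", min_with_scalar<unsigned char>(200, 90.0));    // 90
    return 0;
}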
stream) { - CV_Assert((src1.depth() != CV_64F) || + CV_Assert((src1.depth() != CV_64F) || (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = + static const func_t funcs[] = { - max_caller, max_caller, max_caller, max_caller, max_caller, + max_caller, max_caller, max_caller, max_caller, max_caller, max_caller, max_caller }; funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); @@ -1880,7 +1878,7 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream) //////////////////////////////////////////////////////////////////////// // threshold -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { template void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type, cudaStream_t stream); @@ -1921,10 +1919,10 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double { typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream); - static const caller_t callers[] = + static const caller_t callers[] = { - threshold_caller, threshold_caller, - threshold_caller, threshold_caller, + threshold_caller, threshold_caller, + threshold_caller, threshold_caller, threshold_caller, threshold_caller, threshold_caller }; @@ -1943,7 +1941,7 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double //////////////////////////////////////////////////////////////////////// // pow -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { template void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream); @@ -1958,10 +1956,10 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream) typedef void (*caller_t)(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream); - static const caller_t callers[] = + static const caller_t callers[] = { - pow_caller, pow_caller, - pow_caller, pow_caller, + pow_caller, pow_caller, + pow_caller, pow_caller, pow_caller, pow_caller }; @@ -1992,7 +1990,7 @@ namespace oSizeROI.width = img1.cols; oSizeROI.height = img2.rows; - nppSafeCall( func(img1.ptr(), static_cast(img1.step), img2.ptr(), static_cast(img2.step), + nppSafeCall( func(img1.ptr(), static_cast(img1.step), img2.ptr(), static_cast(img2.step), dst.ptr(), static_cast(dst.step), oSizeROI, eAlphaOp) ); if (stream == 0) @@ -2021,7 +2019,7 @@ void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int typedef void (*func_t)(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, NppiAlphaOp eAlphaOp, cudaStream_t stream); - static const func_t funcs[] = + static const func_t funcs[] = { NppAlphaComp::call, 0, @@ -2046,7 +2044,7 @@ void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int //////////////////////////////////////////////////////////////////////// // addWeighted -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { template void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream); diff --git a/modules/gpu/test/test_arithm.cpp b/modules/gpu/test/test_arithm.cpp index b8fed5e..0bf806b 100644 --- a/modules/gpu/test/test_arithm.cpp +++ b/modules/gpu/test/test_arithm.cpp 
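threshold() dispatches through a small caller table indexed by the threshold type constant (its template arguments are stripped in this extraction). For reference, the five per-element rules those entries implement, collapsed into one host function; OpenCV's enum values THRESH_BINARY..THRESH_TOZERO_INV are 0..4:

#include <algorithm>
#include <cstdio>

// type: 0 THRESH_BINARY, 1 THRESH_BINARY_INV, 2 THRESH_TRUNC,
//       3 THRESH_TOZERO, 4 THRESH_TOZERO_INV
static float threshold_elem(float v, float thresh, float maxVal, int type)
{
    switch (type)
    {
    case 0:  return v > thresh ? maxVal : 0.0f;
    case 1:  return v > thresh ? 0.0f : maxVal;
    case 2:  return std::min(v, thresh);
    case 3:  return v > thresh ? v : 0.0f;
    default: return v > thresh ? 0.0f : v;
    }
}

int main()
{
    for (int type = 0; type < 5; ++type)
        std::printf("type %d: below=%.0f above=%.0f\n", type,
                    threshold_elem(50.0f, 100.0f, 255.0f, type),
                    threshold_elem(150.0f, 100.0f, 255.0f, type));
    return 0;
}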
@@ -43,408 +43,1493 @@ #ifdef HAVE_CUDA -using namespace cvtest; -using namespace testing; +//////////////////////////////////////////////////////////////////////////////// +// Add_Array + +PARAM_TEST_CASE(Add_Array, cv::gpu::DeviceInfo, cv::Size, std::pair, int, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + std::pair depth; + int channels; + bool useRoi; + + int stype; + int dtype; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + channels = GET_PARAM(3); + useRoi = GET_PARAM(4); + + cv::gpu::setDevice(devInfo.deviceID()); + + stype = CV_MAKE_TYPE(depth.first, channels); + dtype = CV_MAKE_TYPE(depth.second, channels); + } +}; + +TEST_P(Add_Array, Accuracy) +{ + if (depth.first == CV_64F || depth.second == CV_64F) + { + if (!devInfo.supports(cv::gpu::NATIVE_DOUBLE)) + return; + } + + cv::Mat mat1 = randomMat(size, stype); + cv::Mat mat2 = randomMat(size, stype); + cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0); + + cv::gpu::GpuMat dst = createMat(size, dtype, useRoi); + dst.setTo(cv::Scalar::all(0)); + cv::gpu::add(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, channels == 1 ? loadMat(mask, useRoi) : cv::gpu::GpuMat(), depth.second); + + cv::Mat dst_gold(size, dtype, cv::Scalar::all(0)); + cv::add(mat1, mat2, dst_gold, channels == 1 ? mask : cv::noArray(), depth.second); + + EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Add_Array, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + DEPTH_PAIRS, + testing::Values(1, 2, 3, 4), + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Add_Scalar + +PARAM_TEST_CASE(Add_Scalar, cv::gpu::DeviceInfo, cv::Size, std::pair, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + std::pair depth; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + useRoi = GET_PARAM(3); + + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(Add_Scalar, Accuracy) +{ + if (depth.first == CV_64F || depth.second == CV_64F) + { + if (!devInfo.supports(cv::gpu::NATIVE_DOUBLE)) + return; + } + + cv::Mat mat = randomMat(size, depth.first); + cv::Scalar val = randomScalar(0, 255); + cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0); + + cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi); + dst.setTo(cv::Scalar::all(0)); + cv::gpu::add(loadMat(mat, useRoi), val, dst, loadMat(mask, useRoi), depth.second); + + cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0)); + cv::add(mat, val, dst_gold, mask, depth.second); + + EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 
1e-4 : 0.0); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Add_Scalar, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + DEPTH_PAIRS, + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Subtract_Array + +PARAM_TEST_CASE(Subtract_Array, cv::gpu::DeviceInfo, cv::Size, std::pair, int, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + std::pair depth; + int channels; + bool useRoi; + + int stype; + int dtype; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + channels = GET_PARAM(3); + useRoi = GET_PARAM(4); + + cv::gpu::setDevice(devInfo.deviceID()); + + stype = CV_MAKE_TYPE(depth.first, channels); + dtype = CV_MAKE_TYPE(depth.second, channels); + } +}; + +TEST_P(Subtract_Array, Accuracy) +{ + if (depth.first == CV_64F || depth.second == CV_64F) + { + if (!devInfo.supports(cv::gpu::NATIVE_DOUBLE)) + return; + } + + cv::Mat mat1 = randomMat(size, stype); + cv::Mat mat2 = randomMat(size, stype); + cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0); + + cv::gpu::GpuMat dst = createMat(size, dtype, useRoi); + dst.setTo(cv::Scalar::all(0)); + cv::gpu::subtract(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, channels == 1 ? loadMat(mask, useRoi) : cv::gpu::GpuMat(), depth.second); + + cv::Mat dst_gold(size, dtype, cv::Scalar::all(0)); + cv::subtract(mat1, mat2, dst_gold, channels == 1 ? mask : cv::noArray(), depth.second); + + EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Subtract_Array, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + DEPTH_PAIRS, + testing::Values(1, 2, 3, 4), + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Subtract_Scalar + +PARAM_TEST_CASE(Subtract_Scalar, cv::gpu::DeviceInfo, cv::Size, std::pair, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + std::pair depth; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + useRoi = GET_PARAM(3); + + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(Subtract_Scalar, Accuracy) +{ + if (depth.first == CV_64F || depth.second == CV_64F) + { + if (!devInfo.supports(cv::gpu::NATIVE_DOUBLE)) + return; + } + + cv::Mat mat = randomMat(size, depth.first); + cv::Scalar val = randomScalar(0, 255); + cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0); + + cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi); + dst.setTo(cv::Scalar::all(0)); + cv::gpu::subtract(loadMat(mat, useRoi), val, dst, loadMat(mask, useRoi), depth.second); + + cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0)); + cv::subtract(mat, val, dst_gold, mask, depth.second); + + EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 
1e-4 : 0.0); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Subtract_Scalar, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + DEPTH_PAIRS, + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Multiply_Array + +PARAM_TEST_CASE(Multiply_Array, cv::gpu::DeviceInfo, cv::Size, std::pair, int, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + std::pair depth; + int channels; + bool useRoi; + + int stype; + int dtype; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + channels = GET_PARAM(3); + useRoi = GET_PARAM(4); + + cv::gpu::setDevice(devInfo.deviceID()); + + stype = CV_MAKE_TYPE(depth.first, channels); + dtype = CV_MAKE_TYPE(depth.second, channels); + } +}; + +TEST_P(Multiply_Array, Accuracy) +{ + if (depth.first == CV_64F || depth.second == CV_64F) + { + if (!devInfo.supports(cv::gpu::NATIVE_DOUBLE)) + return; + } + + cv::Mat mat1 = randomMat(size, stype); + cv::Mat mat2 = randomMat(size, stype); + double scale = randomDouble(0.0, 255.0); + + cv::gpu::GpuMat dst = createMat(size, dtype, useRoi); + cv::gpu::multiply(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, scale, depth.second); + + cv::Mat dst_gold; + cv::multiply(mat1, mat2, dst_gold, scale, depth.second); + + EXPECT_MAT_NEAR(dst_gold, dst, 1.0); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Multiply_Array, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + DEPTH_PAIRS, + testing::Values(1, 2, 3, 4), + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Multiply_Array_Special_Case + +PARAM_TEST_CASE(Multiply_Array_Special_Case, cv::gpu::DeviceInfo, cv::Size, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + useRoi = GET_PARAM(2); + + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(Multiply_Array_Special_Case, _8UC4x_32FC1) +{ + cv::Mat mat1 = randomMat(size, CV_8UC4); + cv::Mat mat2 = randomMat(size, CV_32FC1); + + cv::gpu::GpuMat dst = createMat(size, CV_8UC4, useRoi); + cv::gpu::multiply(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst); + + cv::Mat h_dst(dst); + + for (int y = 0; y < h_dst.rows; ++y) + { + const cv::Vec4b* mat1_row = mat1.ptr(y); + const float* mat2_row = mat2.ptr(y); + const cv::Vec4b* dst_row = h_dst.ptr(y); + + for (int x = 0; x < h_dst.cols; ++x) + { + cv::Vec4b val1 = mat1_row[x]; + float val2 = mat2_row[x]; + cv::Vec4b actual = dst_row[x]; + + cv::Vec4b gold; + + gold[0] = cv::saturate_cast(val1[0] * val2); + gold[1] = cv::saturate_cast(val1[1] * val2); + gold[2] = cv::saturate_cast(val1[2] * val2); + gold[3] = cv::saturate_cast(val1[3] * val2); + + ASSERT_LE(std::abs(gold[0] - actual[0]), 1.0); + ASSERT_LE(std::abs(gold[1] - actual[1]), 1.0); + ASSERT_LE(std::abs(gold[1] - actual[1]), 1.0); + ASSERT_LE(std::abs(gold[1] - actual[1]), 1.0); + } + } +} + +TEST_P(Multiply_Array_Special_Case, _16SC4x_32FC1) +{ + cv::Mat mat1 = randomMat(size, CV_16SC4); + cv::Mat mat2 = randomMat(size, CV_32FC1); + + cv::gpu::GpuMat dst = createMat(size, CV_16SC4, useRoi); + cv::gpu::multiply(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst); + + cv::Mat h_dst(dst); + + for (int y = 0; y < h_dst.rows; ++y) + { + const cv::Vec4s* mat1_row = mat1.ptr(y); + const float* mat2_row = mat2.ptr(y); + const cv::Vec4s* dst_row = h_dst.ptr(y); + + for (int x = 0; x < h_dst.cols; ++x) + { + cv::Vec4s val1 = mat1_row[x]; + float val2 = mat2_row[x]; + cv::Vec4s 
actual = dst_row[x]; + + cv::Vec4s gold; + + gold[0] = cv::saturate_cast(val1[0] * val2); + gold[1] = cv::saturate_cast(val1[1] * val2); + gold[2] = cv::saturate_cast(val1[2] * val2); + gold[3] = cv::saturate_cast(val1[3] * val2); + + ASSERT_LE(std::abs(gold[0] - actual[0]), 1.0); + ASSERT_LE(std::abs(gold[1] - actual[1]), 1.0); + ASSERT_LE(std::abs(gold[1] - actual[1]), 1.0); + ASSERT_LE(std::abs(gold[1] - actual[1]), 1.0); + } + } +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Multiply_Array_Special_Case, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Multiply_Scalar + +PARAM_TEST_CASE(Multiply_Scalar, cv::gpu::DeviceInfo, cv::Size, std::pair, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + std::pair depth; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + useRoi = GET_PARAM(3); + + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(Multiply_Scalar, Accuracy) +{ + if (depth.first == CV_64F || depth.second == CV_64F) + { + if (!devInfo.supports(cv::gpu::NATIVE_DOUBLE)) + return; + } + + cv::Mat mat = randomMat(size, depth.first); + cv::Scalar val = randomScalar(0, 255); + double scale = randomDouble(0.0, 255.0); + + cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi); + cv::gpu::multiply(loadMat(mat, useRoi), val, dst, scale, depth.second); + + cv::Mat dst_gold; + cv::multiply(mat, val, dst_gold, scale, depth.second); + + EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Multiply_Scalar, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + DEPTH_PAIRS, + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Divide_Array + +PARAM_TEST_CASE(Divide_Array, cv::gpu::DeviceInfo, cv::Size, std::pair, int, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + std::pair depth; + int channels; + bool useRoi; + + int stype; + int dtype; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + channels = GET_PARAM(3); + useRoi = GET_PARAM(4); + + cv::gpu::setDevice(devInfo.deviceID()); + + stype = CV_MAKE_TYPE(depth.first, channels); + dtype = CV_MAKE_TYPE(depth.second, channels); + } +}; + +TEST_P(Divide_Array, Accuracy) +{ + if (depth.first == CV_64F || depth.second == CV_64F) + { + if (!devInfo.supports(cv::gpu::NATIVE_DOUBLE)) + return; + } + + cv::Mat mat1 = randomMat(size, stype); + cv::Mat mat2 = randomMat(size, stype, 1.0, 255.0); + double scale = randomDouble(0.0, 255.0); + + cv::gpu::GpuMat dst = createMat(size, dtype, useRoi); + cv::gpu::divide(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, scale, depth.second); + + cv::Mat dst_gold; + cv::divide(mat1, mat2, dst_gold, scale, depth.second); + + EXPECT_MAT_NEAR(dst_gold, dst, 1.0); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Divide_Array, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + DEPTH_PAIRS, + testing::Values(1, 2, 3, 4), + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Divide_Array_Special_Case + +PARAM_TEST_CASE(Divide_Array_Special_Case, cv::gpu::DeviceInfo, cv::Size, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + useRoi = GET_PARAM(2); + + cv::gpu::setDevice(devInfo.deviceID()); + 
} +}; + +TEST_P(Divide_Array_Special_Case, _8UC4x_32FC1) +{ + cv::Mat mat1 = randomMat(size, CV_8UC4); + cv::Mat mat2 = randomMat(size, CV_32FC1, 1.0, 255.0); + + cv::gpu::GpuMat dst = createMat(size, CV_8UC4, useRoi); + cv::gpu::divide(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst); + + cv::Mat h_dst(dst); + + for (int y = 0; y < h_dst.rows; ++y) + { + const cv::Vec4b* mat1_row = mat1.ptr(y); + const float* mat2_row = mat2.ptr(y); + const cv::Vec4b* dst_row = h_dst.ptr(y); + + for (int x = 0; x < h_dst.cols; ++x) + { + cv::Vec4b val1 = mat1_row[x]; + float val2 = mat2_row[x]; + cv::Vec4b actual = dst_row[x]; + + cv::Vec4b gold; + + gold[0] = cv::saturate_cast(val1[0] / val2); + gold[1] = cv::saturate_cast(val1[1] / val2); + gold[2] = cv::saturate_cast(val1[2] / val2); + gold[3] = cv::saturate_cast(val1[3] / val2); + + ASSERT_LE(std::abs(gold[0] - actual[0]), 1.0); + ASSERT_LE(std::abs(gold[1] - actual[1]), 1.0); + ASSERT_LE(std::abs(gold[1] - actual[1]), 1.0); + ASSERT_LE(std::abs(gold[1] - actual[1]), 1.0); + } + } +} + +TEST_P(Divide_Array_Special_Case, _16SC4x_32FC1) +{ + cv::Mat mat1 = randomMat(size, CV_16SC4); + cv::Mat mat2 = randomMat(size, CV_32FC1, 1.0, 255.0); + + cv::gpu::GpuMat dst = createMat(size, CV_16SC4, useRoi); + cv::gpu::divide(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst); + + cv::Mat h_dst(dst); + + for (int y = 0; y < h_dst.rows; ++y) + { + const cv::Vec4s* mat1_row = mat1.ptr(y); + const float* mat2_row = mat2.ptr(y); + const cv::Vec4s* dst_row = h_dst.ptr(y); + + for (int x = 0; x < h_dst.cols; ++x) + { + cv::Vec4s val1 = mat1_row[x]; + float val2 = mat2_row[x]; + cv::Vec4s actual = dst_row[x]; + + cv::Vec4s gold; + + gold[0] = cv::saturate_cast(val1[0] / val2); + gold[1] = cv::saturate_cast(val1[1] / val2); + gold[2] = cv::saturate_cast(val1[2] / val2); + gold[3] = cv::saturate_cast(val1[3] / val2); + + ASSERT_LE(std::abs(gold[0] - actual[0]), 1.0); + ASSERT_LE(std::abs(gold[1] - actual[1]), 1.0); + ASSERT_LE(std::abs(gold[1] - actual[1]), 1.0); + ASSERT_LE(std::abs(gold[1] - actual[1]), 1.0); + } + } +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Divide_Array_Special_Case, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Divide_Scalar + +PARAM_TEST_CASE(Divide_Scalar, cv::gpu::DeviceInfo, cv::Size, std::pair, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + std::pair depth; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + useRoi = GET_PARAM(3); + + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(Divide_Scalar, Accuracy) +{ + if (depth.first == CV_64F || depth.second == CV_64F) + { + if (!devInfo.supports(cv::gpu::NATIVE_DOUBLE)) + return; + } + + cv::Mat mat = randomMat(size, depth.first); + cv::Scalar val = randomScalar(1.0, 255.0); + double scale = randomDouble(0.0, 255.0); + + cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi); + cv::gpu::divide(loadMat(mat, useRoi), val, dst, scale, depth.second); + + cv::Mat dst_gold; + cv::divide(mat, val, dst_gold, scale, depth.second); + + EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 
1e-4 : 0.0); +} -PARAM_TEST_CASE(ArithmTestBase, cv::gpu::DeviceInfo, MatType, UseRoi) +INSTANTIATE_TEST_CASE_P(GPU_Core, Divide_Scalar, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + DEPTH_PAIRS, + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Divide_Scalar_Inv + +PARAM_TEST_CASE(Divide_Scalar_Inv, cv::gpu::DeviceInfo, cv::Size, std::pair, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + std::pair depth; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + useRoi = GET_PARAM(3); + + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(Divide_Scalar_Inv, Accuracy) +{ + if (depth.first == CV_64F || depth.second == CV_64F) + { + if (!devInfo.supports(cv::gpu::NATIVE_DOUBLE)) + return; + } + + double scale = randomDouble(0.0, 255.0); + cv::Mat mat = randomMat(size, depth.first, 1.0, 255.0); + + cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi); + cv::gpu::divide(scale, loadMat(mat, useRoi), dst, depth.second); + + cv::Mat dst_gold; + cv::divide(scale, mat, dst_gold, depth.second); + + EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Divide_Scalar_Inv, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + DEPTH_PAIRS, + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// AbsDiff + +PARAM_TEST_CASE(AbsDiff, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + int depth; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + useRoi = GET_PARAM(3); + + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(AbsDiff, Array) +{ + if (depth == CV_64F) + { + if (!devInfo.supports(cv::gpu::NATIVE_DOUBLE)) + return; + } + + cv::Mat src1 = randomMat(size, depth); + cv::Mat src2 = randomMat(size, depth); + + cv::gpu::GpuMat dst = createMat(size, depth, useRoi); + cv::gpu::absdiff(loadMat(src1, useRoi), loadMat(src2, useRoi), dst); + + cv::Mat dst_gold; + cv::absdiff(src1, src2, dst_gold); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +TEST_P(AbsDiff, Scalar) +{ + if (depth == CV_64F) + { + if (!devInfo.supports(cv::gpu::NATIVE_DOUBLE)) + return; + } + + cv::Mat src = randomMat(size, depth); + cv::Scalar val = randomScalar(0.0, 255.0); + + cv::gpu::GpuMat dst = createMat(size, depth, useRoi); + cv::gpu::absdiff(loadMat(src, useRoi), val, dst); + + cv::Mat dst_gold; + cv::absdiff(src, val, dst_gold); + + EXPECT_MAT_NEAR(dst_gold, dst, depth <= CV_32F ? 
1.0 : 1e-5); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, AbsDiff, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + ALL_DEPTH, + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Abs + +PARAM_TEST_CASE(Abs, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + int type; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + type = GET_PARAM(2); + useRoi = GET_PARAM(3); + + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(Abs, Accuracy) +{ + cv::Mat src = randomMat(size, type); + + cv::gpu::GpuMat dst = createMat(size, type, useRoi); + cv::gpu::abs(loadMat(src, useRoi), dst); + + cv::Mat dst_gold = cv::abs(src); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Abs, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Values(MatType(CV_16SC1), MatType(CV_32FC1)), + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Sqr + +PARAM_TEST_CASE(Sqr, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + int type; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + type = GET_PARAM(2); + useRoi = GET_PARAM(3); + + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(Sqr, Accuracy) +{ + cv::Mat src = randomMat(size, type); + + cv::gpu::GpuMat dst = createMat(size, type, useRoi); + cv::gpu::sqr(loadMat(src, useRoi), dst); + + cv::Mat dst_gold; + cv::multiply(src, src, dst_gold); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Sqr, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Values(MatType(CV_8UC1), MatType(CV_16UC1), MatType(CV_16SC1), MatType(CV_32FC1)), + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Sqrt + +namespace +{ + template void sqrtImpl(const cv::Mat& src, cv::Mat& dst) + { + dst.create(src.size(), src.type()); + + for (int y = 0; y < src.rows; ++y) + { + for (int x = 0; x < src.cols; ++x) + dst.at(y, x) = static_cast(std::sqrt(static_cast(src.at(y, x)))); + } + } + + void sqrtGold(const cv::Mat& src, cv::Mat& dst) + { + typedef void (*func_t)(const cv::Mat& src, cv::Mat& dst); + + const func_t funcs[] = + { + sqrtImpl, sqrtImpl, sqrtImpl, sqrtImpl, + sqrtImpl, sqrtImpl + }; + + funcs[src.depth()](src, dst); + } +} + +PARAM_TEST_CASE(Sqrt, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + int type; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + type = GET_PARAM(2); + useRoi = GET_PARAM(3); + + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(Sqrt, Accuracy) +{ + cv::Mat src = randomMat(size, type); + + cv::gpu::GpuMat dst = createMat(size, type, useRoi); + cv::gpu::sqrt(loadMat(src, useRoi), dst); + + cv::Mat dst_gold; + sqrtGold(src, dst_gold); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Sqrt, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Values(MatType(CV_8UC1), MatType(CV_16UC1), MatType(CV_16SC1), MatType(CV_32FC1)), + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Log + +namespace +{ + template void logImpl(const cv::Mat& src, cv::Mat& dst) + { + dst.create(src.size(), src.type()); + + for (int y = 0; y < src.rows; ++y) + { + for 
(int x = 0; x < src.cols; ++x) + dst.at(y, x) = static_cast(std::log(static_cast(src.at(y, x)))); + } + } + + void logGold(const cv::Mat& src, cv::Mat& dst) + { + typedef void (*func_t)(const cv::Mat& src, cv::Mat& dst); + + const func_t funcs[] = + { + logImpl, logImpl, logImpl, logImpl, + logImpl, logImpl + }; + + funcs[src.depth()](src, dst); + } +} + +PARAM_TEST_CASE(Log, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + int type; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + type = GET_PARAM(2); + useRoi = GET_PARAM(3); + + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(Log, Accuracy) +{ + cv::Mat src = randomMat(size, type, 1.0, 255.0); + + cv::gpu::GpuMat dst = createMat(size, type, useRoi); + cv::gpu::log(loadMat(src, useRoi), dst); + + cv::Mat dst_gold; + logGold(src, dst_gold); + + EXPECT_MAT_NEAR(dst_gold, dst, 1e-6); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Log, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Values(MatType(CV_8UC1), MatType(CV_16UC1), MatType(CV_16SC1), MatType(CV_32FC1)), + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// Exp + +PARAM_TEST_CASE(Exp, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + int type; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + type = GET_PARAM(2); + useRoi = GET_PARAM(3); + + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(Exp, Accuracy) +{ + cv::Mat src = randomMat(size, type, 0.0, 10.0); + + cv::gpu::GpuMat dst = createMat(size, type, useRoi); + cv::gpu::exp(loadMat(src, useRoi), dst); + + cv::Mat dst_gold; + cv::exp(src, dst_gold); + + EXPECT_MAT_NEAR(dst_gold, dst, 1e-2); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Exp, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Values(MatType(CV_32FC1)), + WHOLE_SUBMAT)); + +//////////////////////////////////////////////////////////////////////////////// +// compare + +PARAM_TEST_CASE(Compare, cv::gpu::DeviceInfo, cv::Size, MatDepth, CmpCode, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + int depth; + int cmp_code; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + cmp_code = GET_PARAM(3); + useRoi = GET_PARAM(4); + + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(Compare, Accuracy) +{ + cv::Mat src1 = randomMat(size, depth); + cv::Mat src2 = randomMat(size, depth); + + cv::gpu::GpuMat dst = createMat(size, CV_8UC1, useRoi); + cv::gpu::compare(loadMat(src1, useRoi), loadMat(src2, useRoi), dst, cmp_code); + + cv::Mat dst_gold; + cv::compare(src1, src2, dst_gold, cmp_code); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Compare, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + ALL_DEPTH, + ALL_CMP_CODES, + WHOLE_SUBMAT)); + +////////////////////////////////////////////////////////////////////////////// +// Bitwise_Array + +PARAM_TEST_CASE(Bitwise_Array, cv::gpu::DeviceInfo, cv::Size, MatType) { cv::gpu::DeviceInfo devInfo; + cv::Size size; int type; - bool useRoi; - cv::Size size; - cv::Mat mat1; - cv::Mat mat2; - cv::Scalar val; - + cv::Mat src1; + cv::Mat src2; + virtual void SetUp() { devInfo = GET_PARAM(0); - type = GET_PARAM(1); - useRoi = GET_PARAM(2); + size = GET_PARAM(1); + type = GET_PARAM(2); cv::gpu::setDevice(devInfo.deviceID()); - cv::RNG& rng = 
TS::ptr()->get_rng(); - - size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); - - mat1 = randomMat(rng, size, type, 5, 16, false); - mat2 = randomMat(rng, size, type, 5, 16, false); - - val = cv::Scalar(rng.uniform(1, 3), rng.uniform(1, 3), rng.uniform(1, 3), rng.uniform(1, 3)); + src1 = randomMat(size, type, 0.0, std::numeric_limits::max()); + src2 = randomMat(size, type, 0.0, std::numeric_limits::max()); } }; -//////////////////////////////////////////////////////////////////////////////// -// add - -struct Add : ArithmTestBase {}; - -TEST_P(Add, Array) -{ - cv::Mat dst_gold; - cv::add(mat1, mat2, dst_gold); +TEST_P(Bitwise_Array, Not) +{ + cv::gpu::GpuMat dst; + cv::gpu::bitwise_not(loadMat(src1), dst); - cv::Mat dst; + cv::Mat dst_gold = ~src1; - cv::gpu::GpuMat gpuRes; + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} - cv::gpu::add(loadMat(mat1, useRoi), loadMat(mat2, useRoi), gpuRes); +TEST_P(Bitwise_Array, Or) +{ + cv::gpu::GpuMat dst; + cv::gpu::bitwise_or(loadMat(src1), loadMat(src2), dst); - gpuRes.download(dst); + cv::Mat dst_gold = src1 | src2; EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } -TEST_P(Add, Scalar) -{ - cv::Mat dst_gold; - cv::add(mat1, val, dst_gold); +TEST_P(Bitwise_Array, And) +{ + cv::gpu::GpuMat dst; + cv::gpu::bitwise_and(loadMat(src1), loadMat(src2), dst); - cv::Mat dst; + cv::Mat dst_gold = src1 & src2; - cv::gpu::GpuMat gpuRes; + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} - cv::gpu::add(loadMat(mat1, useRoi), val, gpuRes); +TEST_P(Bitwise_Array, Xor) +{ + cv::gpu::GpuMat dst; + cv::gpu::bitwise_xor(loadMat(src1), loadMat(src2), dst); - gpuRes.download(dst); + cv::Mat dst_gold = src1 ^ src2; - EXPECT_MAT_NEAR(dst_gold, dst, 1e-5); + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } -INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine( - ALL_DEVICES, - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4, - CV_32SC1, CV_32SC2, CV_32SC3, CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4), - WHOLE_SUBMAT)); +INSTANTIATE_TEST_CASE_P(GPU_Core, Bitwise_Array, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + TYPES(CV_8U, CV_32S, 1, 4))); -//////////////////////////////////////////////////////////////////////////////// -// subtract +////////////////////////////////////////////////////////////////////////////// +// Bitwise_Scalar -struct Subtract : ArithmTestBase {}; +PARAM_TEST_CASE(Bitwise_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth, int) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + int depth; + int channels; -TEST_P(Subtract, Array) -{ - cv::Mat dst_gold; - cv::subtract(mat1, mat2, dst_gold); + cv::Mat src; + cv::Scalar val; - cv::Mat dst; + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + channels = GET_PARAM(3); - cv::gpu::GpuMat gpuRes; + cv::gpu::setDevice(devInfo.deviceID()); - cv::gpu::subtract(loadMat(mat1, useRoi), loadMat(mat2, useRoi), gpuRes); + src = randomMat(size, CV_MAKE_TYPE(depth, channels)); + cv::Scalar_ ival = randomScalar(0.0, 255.0); + val = ival; + } +}; - gpuRes.download(dst); +TEST_P(Bitwise_Scalar, Or) +{ + cv::gpu::GpuMat dst; + cv::gpu::bitwise_or(loadMat(src), val, dst); + + cv::Mat dst_gold; + cv::bitwise_or(src, val, dst_gold); EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } -TEST_P(Subtract, Scalar) -{ - cv::Mat dst_gold; - cv::subtract(mat1, val, dst_gold); +TEST_P(Bitwise_Scalar, And) +{ + cv::gpu::GpuMat dst; + cv::gpu::bitwise_and(loadMat(src), val, dst); - cv::Mat dst; + cv::Mat dst_gold; + cv::bitwise_and(src, val, dst_gold); - cv::gpu::GpuMat gpuRes; + 
EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} - cv::gpu::subtract(loadMat(mat1, useRoi), val, gpuRes); +TEST_P(Bitwise_Scalar, Xor) +{ + cv::gpu::GpuMat dst; + cv::gpu::bitwise_xor(loadMat(src), val, dst); - gpuRes.download(dst); + cv::Mat dst_gold; + cv::bitwise_xor(src, val, dst_gold); - EXPECT_MAT_NEAR(dst_gold, dst, 1e-5); + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } -INSTANTIATE_TEST_CASE_P(Arithm, Subtract, Combine( - ALL_DEVICES, - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4, - CV_32SC1, CV_32SC2, CV_32SC3, CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4), - WHOLE_SUBMAT)); - -//////////////////////////////////////////////////////////////////////////////// -// multiply +INSTANTIATE_TEST_CASE_P(GPU_Core, Bitwise_Scalar, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32S)), + testing::Values(1, 3, 4))); -struct Multiply : ArithmTestBase {}; +////////////////////////////////////////////////////////////////////////////// +// RShift -TEST_P(Multiply, Array) -{ - cv::Mat dst_gold; - cv::multiply(mat1, mat2, dst_gold); +namespace +{ + template void rhiftImpl(const cv::Mat& src, cv::Scalar_ val, cv::Mat& dst) + { + const int cn = src.channels(); - cv::Mat dst; + dst.create(src.size(), src.type()); - cv::gpu::GpuMat gpuRes; + for (int y = 0; y < src.rows; ++y) + { + for (int x = 0; x < src.cols; ++x) + { + for (int c = 0; c < cn; ++c) + dst.at(y, x * cn + c) = src.at(y, x * cn + c) >> val.val[c]; + } + } + } - cv::gpu::multiply(loadMat(mat1, useRoi), loadMat(mat2, useRoi), gpuRes); + void rhiftGold(const cv::Mat& src, cv::Scalar_ val, cv::Mat& dst) + { + typedef void (*func_t)(const cv::Mat& src, cv::Scalar_ val, cv::Mat& dst); - gpuRes.download(dst); + const func_t funcs[] = + { + rhiftImpl, rhiftImpl, rhiftImpl, rhiftImpl, rhiftImpl + }; - EXPECT_MAT_NEAR(dst_gold, dst, 0.0); + funcs[src.depth()](src, val, dst); + } } -TEST_P(Multiply, Scalar) -{ - cv::Mat dst_gold; - cv::multiply(mat1, val, dst_gold); +PARAM_TEST_CASE(RShift, cv::gpu::DeviceInfo, cv::Size, MatDepth, int, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + int depth; + int channels; + bool useRoi; - cv::Mat dst; + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + channels = GET_PARAM(3); + useRoi = GET_PARAM(4); - cv::gpu::GpuMat gpuRes; + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(RShift, Accuracy) +{ + int type = CV_MAKE_TYPE(depth, channels); + cv::Mat src = randomMat(size, type); + cv::Scalar_ val = randomScalar(0.0, 8.0); - cv::gpu::multiply(loadMat(mat1, useRoi), val, gpuRes); + cv::gpu::GpuMat dst = createMat(size, type, useRoi); + cv::gpu::rshift(loadMat(src, useRoi), val, dst); - gpuRes.download(dst); + cv::Mat dst_gold; + rhiftGold(src, val, dst_gold); - EXPECT_MAT_NEAR(dst_gold, dst, 1e-5); + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } -INSTANTIATE_TEST_CASE_P(Arithm, Multiply, Combine( - ALL_DEVICES, - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC3, CV_16SC4, - CV_32SC1, CV_32SC3, CV_32FC1, CV_32FC3, CV_32FC4), - WHOLE_SUBMAT)); - -//////////////////////////////////////////////////////////////////////////////// -// divide +INSTANTIATE_TEST_CASE_P(GPU_Core, RShift, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Values(MatDepth(CV_8U), MatDepth(CV_8S), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32S)), + testing::Values(1, 3, 4), + WHOLE_SUBMAT)); -struct Divide : ArithmTestBase {}; 
+////////////////////////////////////////////////////////////////////////////// +// LShift -TEST_P(Divide, Array) -{ - cv::Mat dst_gold; - cv::divide(mat1, mat2, dst_gold); +namespace +{ + template void lhiftImpl(const cv::Mat& src, cv::Scalar_ val, cv::Mat& dst) + { + const int cn = src.channels(); - cv::Mat dst; + dst.create(src.size(), src.type()); - cv::gpu::GpuMat gpuRes; + for (int y = 0; y < src.rows; ++y) + { + for (int x = 0; x < src.cols; ++x) + { + for (int c = 0; c < cn; ++c) + dst.at(y, x * cn + c) = src.at(y, x * cn + c) << val.val[c]; + } + } + } - cv::gpu::divide(loadMat(mat1, useRoi), loadMat(mat2, useRoi), gpuRes); + void lhiftGold(const cv::Mat& src, cv::Scalar_ val, cv::Mat& dst) + { + typedef void (*func_t)(const cv::Mat& src, cv::Scalar_ val, cv::Mat& dst); - gpuRes.download(dst); + const func_t funcs[] = + { + lhiftImpl, lhiftImpl, lhiftImpl, lhiftImpl, lhiftImpl + }; - EXPECT_MAT_NEAR(dst_gold, dst, mat1.depth() == CV_32F ? 1e-5 : 1); + funcs[src.depth()](src, val, dst); + } } -TEST_P(Divide, Scalar) -{ - cv::Mat dst_gold; - cv::divide(mat1, val, dst_gold); +PARAM_TEST_CASE(LShift, cv::gpu::DeviceInfo, cv::Size, MatDepth, int, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + int depth; + int channels; + bool useRoi; - cv::Mat dst; + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + channels = GET_PARAM(3); + useRoi = GET_PARAM(4); - cv::gpu::GpuMat gpuRes; + cv::gpu::setDevice(devInfo.deviceID()); + } +}; - cv::gpu::divide(loadMat(mat1, useRoi), val, gpuRes); +TEST_P(LShift, Accuracy) +{ + int type = CV_MAKE_TYPE(depth, channels); + cv::Mat src = randomMat(size, type); + cv::Scalar_ val = randomScalar(0.0, 8.0); - gpuRes.download(dst); + cv::gpu::GpuMat dst = createMat(size, type, useRoi); + cv::gpu::rshift(loadMat(src, useRoi), val, dst); - EXPECT_MAT_NEAR(dst_gold, dst, mat1.depth() == CV_32F ? 
1e-5 : 1); -} + cv::Mat dst_gold; + rhiftGold(src, val, dst_gold); -INSTANTIATE_TEST_CASE_P(Arithm, Divide, Combine( - ALL_DEVICES, - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC3, CV_16SC4, - CV_32SC1, CV_32SC3, CV_32FC1, CV_32FC3, CV_32FC4), - WHOLE_SUBMAT)); + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} -//////////////////////////////////////////////////////////////////////////////// -// transpose +INSTANTIATE_TEST_CASE_P(GPU_Core, LShift, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32S)), + testing::Values(1, 3, 4), + WHOLE_SUBMAT)); -struct Transpose : ArithmTestBase {}; +////////////////////////////////////////////////////////////////////////////// +// Min -TEST_P(Transpose, Accuracy) +PARAM_TEST_CASE(Min, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi) { - cv::Mat dst_gold; - cv::transpose(mat1, dst_gold); + cv::gpu::DeviceInfo devInfo; + cv::Size size; + int depth; + bool useRoi; - cv::Mat dst; - - cv::gpu::GpuMat gpuRes; + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + useRoi = GET_PARAM(3); - cv::gpu::transpose(loadMat(mat1, useRoi), gpuRes); + cv::gpu::setDevice(devInfo.deviceID()); + } +}; - gpuRes.download(dst); +TEST_P(Min, Accuracy) +{ + cv::Mat src1 = randomMat(size, depth); + cv::Mat src2 = randomMat(size, depth); + + cv::gpu::GpuMat dst = createMat(size, depth, useRoi); + cv::gpu::min(loadMat(src1, useRoi), loadMat(src2, useRoi), dst); + + cv::Mat dst_gold = cv::min(src1, src2); EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } -INSTANTIATE_TEST_CASE_P(Arithm, Transpose, Combine( - ALL_DEVICES, - Values(CV_8UC1, CV_8UC4, CV_8SC1, CV_8SC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32SC2, CV_32FC1, CV_32FC2, CV_64FC1), - WHOLE_SUBMAT)); +INSTANTIATE_TEST_CASE_P(GPU_Core, Min, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + ALL_DEPTH, + WHOLE_SUBMAT)); -//////////////////////////////////////////////////////////////////////////////// -// absdiff +////////////////////////////////////////////////////////////////////////////// +// Max -struct Absdiff : ArithmTestBase {}; +PARAM_TEST_CASE(Max, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + int depth; + bool useRoi; -TEST_P(Absdiff, Array) -{ - cv::Mat dst_gold; - cv::absdiff(mat1, mat2, dst_gold); + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + useRoi = GET_PARAM(3); - cv::Mat dst; + cv::gpu::setDevice(devInfo.deviceID()); + } +}; - cv::gpu::GpuMat gpuRes; +TEST_P(Max, Accuracy) +{ + cv::Mat src1 = randomMat(size, depth); + cv::Mat src2 = randomMat(size, depth); - cv::gpu::absdiff(loadMat(mat1, useRoi), loadMat(mat2, useRoi), gpuRes); + cv::gpu::GpuMat dst = createMat(size, depth, useRoi); + cv::gpu::max(loadMat(src1, useRoi), loadMat(src2, useRoi), dst); - gpuRes.download(dst); + cv::Mat dst_gold = cv::max(src1, src2); EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } -TEST_P(Absdiff, Scalar) -{ - cv::Mat dst_gold; - cv::absdiff(mat1, val, dst_gold); - - cv::Mat dst; - - cv::gpu::GpuMat gpuRes; - - cv::gpu::absdiff(loadMat(mat1, useRoi), val, gpuRes); +INSTANTIATE_TEST_CASE_P(GPU_Core, Max, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + ALL_DEPTH, + WHOLE_SUBMAT)); - gpuRes.download(dst); - EXPECT_MAT_NEAR(dst_gold, dst, 1e-5); -} -INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine( - ALL_DEVICES, - Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1), - WHOLE_SUBMAT)); 
-//////////////////////////////////////////////////////////////////////////////// -// abs -struct Abs : ArithmTestBase {}; -TEST_P(Abs, Array) -{ - cv::Mat dst_gold = cv::abs(mat1); - cv::Mat dst; - cv::gpu::GpuMat gpuRes; - cv::gpu::abs(loadMat(mat1, useRoi), gpuRes); - gpuRes.download(dst); - EXPECT_MAT_NEAR(dst_gold, dst, 0.0); -} -INSTANTIATE_TEST_CASE_P(Arithm, Abs, Combine( - ALL_DEVICES, - Values(CV_16SC1, CV_32FC1), - WHOLE_SUBMAT)); -//////////////////////////////////////////////////////////////////////////////// -// Sqr -struct Sqr : ArithmTestBase {}; -TEST_P(Sqr, Array) -{ - cv::Mat dst_gold; - cv::multiply(mat1, mat1, dst_gold); - cv::Mat dst; - cv::gpu::GpuMat gpuRes; - cv::gpu::sqr(loadMat(mat1, useRoi), gpuRes); - gpuRes.download(dst); - EXPECT_MAT_NEAR(dst_gold, dst, 0.0); -} -INSTANTIATE_TEST_CASE_P(Arithm, Sqr, Combine( - ALL_DEVICES, - Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1), - WHOLE_SUBMAT)); -//////////////////////////////////////////////////////////////////////////////// -// Sqrt -struct Sqrt : ArithmTestBase {}; -TEST_P(Sqrt, Array) -{ - cv::Mat dst_gold; - cv::sqrt(mat1, dst_gold); - cv::Mat dst; - cv::gpu::GpuMat gpuRes; - cv::gpu::sqrt(loadMat(mat1, useRoi), gpuRes); - gpuRes.download(dst); - EXPECT_MAT_NEAR(dst_gold, dst, 1e-6); -} -INSTANTIATE_TEST_CASE_P(Arithm, Sqrt, Combine( - ALL_DEVICES, - Values(MatType(CV_32FC1)), - WHOLE_SUBMAT)); -//////////////////////////////////////////////////////////////////////////////// -// compare +using namespace cvtest; +using namespace testing; -PARAM_TEST_CASE(Compare, cv::gpu::DeviceInfo, MatType, CmpCode, UseRoi) +PARAM_TEST_CASE(ArithmTestBase, cv::gpu::DeviceInfo, MatType, UseRoi) { cv::gpu::DeviceInfo devInfo; int type; - int cmp_code; bool useRoi; cv::Size size; - cv::Mat mat1, mat2; + cv::Mat mat1; + cv::Mat mat2; + cv::Scalar val; - cv::Mat dst_gold; - virtual void SetUp() { devInfo = GET_PARAM(0); type = GET_PARAM(1); - cmp_code = GET_PARAM(2); - useRoi = GET_PARAM(3); + useRoi = GET_PARAM(2); cv::gpu::setDevice(devInfo.deviceID()); cv::RNG& rng = TS::ptr()->get_rng(); size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); - - mat1 = randomMat(rng, size, type, 1, 16, false); - mat2 = randomMat(rng, size, type, 1, 16, false); - cv::compare(mat1, mat2, dst_gold, cmp_code); + mat1 = randomMat(rng, size, type, 5, 16, false); + mat2 = randomMat(rng, size, type, 5, 16, false); + + val = cv::Scalar(rng.uniform(1, 3), rng.uniform(1, 3), rng.uniform(1, 3), rng.uniform(1, 3)); } }; -TEST_P(Compare, Accuracy) +//////////////////////////////////////////////////////////////////////////////// +// transpose + +struct Transpose : ArithmTestBase {}; + +TEST_P(Transpose, Accuracy) { + cv::Mat dst_gold; + cv::transpose(mat1, dst_gold); + cv::Mat dst; - + cv::gpu::GpuMat gpuRes; - cv::gpu::compare(loadMat(mat1, useRoi), loadMat(mat2, useRoi), gpuRes, cmp_code); + cv::gpu::transpose(loadMat(mat1, useRoi), gpuRes); gpuRes.download(dst); EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } -INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine( +INSTANTIATE_TEST_CASE_P(Arithm, Transpose, Combine( ALL_DEVICES, - Values(CV_8UC1, CV_16UC1, CV_32SC1), - Values((int) cv::CMP_EQ, (int) cv::CMP_GT, (int) cv::CMP_GE, (int) cv::CMP_LT, (int) cv::CMP_LE, (int) cv::CMP_NE), + Values(CV_8UC1, CV_8UC4, CV_8SC1, CV_8SC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32SC2, CV_32FC1, CV_32FC2, CV_64FC1), WHOLE_SUBMAT)); //////////////////////////////////////////////////////////////////////////////// @@ -461,7 +1546,7 @@ PARAM_TEST_CASE(MeanStdDev, cv::gpu::DeviceInfo, 
UseRoi) cv::Scalar mean_gold; cv::Scalar stddev_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); useRoi = GET_PARAM(1); @@ -471,18 +1556,18 @@ PARAM_TEST_CASE(MeanStdDev, cv::gpu::DeviceInfo, UseRoi) cv::RNG& rng = TS::ptr()->get_rng(); size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); - + mat = randomMat(rng, size, CV_8UC1, 1, 255, false); cv::meanStdDev(mat, mean_gold, stddev_gold); } }; -TEST_P(MeanStdDev, Accuracy) +TEST_P(MeanStdDev, Accuracy) { cv::Scalar mean; cv::Scalar stddev; - + cv::gpu::meanStdDev(loadMat(mat, useRoi), mean, stddev); EXPECT_NEAR(mean_gold[0], mean[0], 1e-5); @@ -514,7 +1599,7 @@ PARAM_TEST_CASE(NormDiff, cv::gpu::DeviceInfo, NormCode, UseRoi) double norm_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); normCode = GET_PARAM(1); @@ -525,7 +1610,7 @@ PARAM_TEST_CASE(NormDiff, cv::gpu::DeviceInfo, NormCode, UseRoi) cv::RNG& rng = TS::ptr()->get_rng(); size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); - + mat1 = randomMat(rng, size, CV_8UC1, 1, 255, false); mat2 = randomMat(rng, size, CV_8UC1, 1, 255, false); @@ -533,8 +1618,8 @@ PARAM_TEST_CASE(NormDiff, cv::gpu::DeviceInfo, NormCode, UseRoi) } }; -TEST_P(NormDiff, Accuracy) -{ +TEST_P(NormDiff, Accuracy) +{ double norm = cv::gpu::norm(loadMat(mat1, useRoi), loadMat(mat2, useRoi), normCode); EXPECT_NEAR(norm_gold, norm, 1e-6); @@ -560,7 +1645,7 @@ PARAM_TEST_CASE(Flip, cv::gpu::DeviceInfo, MatType, FlipCode, UseRoi) cv::Mat dst_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); type = GET_PARAM(1); @@ -572,17 +1657,17 @@ PARAM_TEST_CASE(Flip, cv::gpu::DeviceInfo, MatType, FlipCode, UseRoi) cv::RNG& rng = TS::ptr()->get_rng(); size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); - + mat = randomMat(rng, size, type, 1, 255, false); cv::flip(mat, dst_gold, flip_code); } }; -TEST_P(Flip, Accuracy) -{ +TEST_P(Flip, Accuracy) +{ cv::Mat dst; - + cv::gpu::GpuMat gpu_res; cv::gpu::flip(loadMat(mat, useRoi), gpu_res, flip_code); @@ -613,7 +1698,7 @@ PARAM_TEST_CASE(LUT, cv::gpu::DeviceInfo, MatType, UseRoi) cv::Mat dst_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); type = GET_PARAM(1); @@ -624,7 +1709,7 @@ PARAM_TEST_CASE(LUT, cv::gpu::DeviceInfo, MatType, UseRoi) cv::RNG& rng = TS::ptr()->get_rng(); size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); - + mat = randomMat(rng, size, type, 1, 255, false); lut = randomMat(rng, cv::Size(256, 1), CV_8UC1, 100, 200, false); @@ -632,10 +1717,10 @@ PARAM_TEST_CASE(LUT, cv::gpu::DeviceInfo, MatType, UseRoi) } }; -TEST_P(LUT, Accuracy) +TEST_P(LUT, Accuracy) { cv::Mat dst; - + cv::gpu::GpuMat gpu_res; cv::gpu::LUT(loadMat(mat, useRoi), lut, gpu_res); @@ -651,53 +1736,6 @@ INSTANTIATE_TEST_CASE_P(Arithm, LUT, Combine( WHOLE_SUBMAT)); //////////////////////////////////////////////////////////////////////////////// -// exp - -PARAM_TEST_CASE(Exp, cv::gpu::DeviceInfo, UseRoi) -{ - cv::gpu::DeviceInfo devInfo; - bool useRoi; - - cv::Size size; - cv::Mat mat; - - cv::Mat dst_gold; - - virtual void SetUp() - { - devInfo = GET_PARAM(0); - useRoi = GET_PARAM(1); - - cv::gpu::setDevice(devInfo.deviceID()); - - cv::RNG& rng = TS::ptr()->get_rng(); - - size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); - - mat = randomMat(rng, size, CV_32FC1, -10.0, 2.0, false); - - cv::exp(mat, dst_gold); - } -}; - -TEST_P(Exp, Accuracy) -{ - cv::Mat dst; - - cv::gpu::GpuMat gpu_res; - - cv::gpu::exp(loadMat(mat, useRoi), gpu_res); - - 
gpu_res.download(dst); - - EXPECT_MAT_NEAR(dst_gold, dst, 1e-5); -} - -INSTANTIATE_TEST_CASE_P(Arithm, Exp, Combine( - ALL_DEVICES, - WHOLE_SUBMAT)); - -//////////////////////////////////////////////////////////////////////////////// // pow PARAM_TEST_CASE(Pow, cv::gpu::DeviceInfo, MatType, UseRoi) @@ -712,7 +1750,7 @@ PARAM_TEST_CASE(Pow, cv::gpu::DeviceInfo, MatType, UseRoi) cv::Mat dst_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); type = GET_PARAM(1); @@ -724,7 +1762,7 @@ PARAM_TEST_CASE(Pow, cv::gpu::DeviceInfo, MatType, UseRoi) size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); - mat = randomMat(rng, size, type, 0.0, 100.0, false); + mat = randomMat(rng, size, type, 0.0, 100.0, false); if (mat.depth() == CV_32F) power = rng.uniform(1.2f, 3.f); @@ -738,10 +1776,10 @@ PARAM_TEST_CASE(Pow, cv::gpu::DeviceInfo, MatType, UseRoi) } }; -TEST_P(Pow, Accuracy) +TEST_P(Pow, Accuracy) { cv::Mat dst; - + cv::gpu::GpuMat gpu_res; cv::gpu::pow(loadMat(mat, useRoi), power, gpu_res); @@ -757,53 +1795,6 @@ INSTANTIATE_TEST_CASE_P(Arithm, Pow, Combine( WHOLE_SUBMAT)); //////////////////////////////////////////////////////////////////////////////// -// log - -PARAM_TEST_CASE(Log, cv::gpu::DeviceInfo, UseRoi) -{ - cv::gpu::DeviceInfo devInfo; - bool useRoi; - - cv::Size size; - cv::Mat mat; - - cv::Mat dst_gold; - - virtual void SetUp() - { - devInfo = GET_PARAM(0); - useRoi = GET_PARAM(1); - - cv::gpu::setDevice(devInfo.deviceID()); - - cv::RNG& rng = TS::ptr()->get_rng(); - - size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); - - mat = randomMat(rng, size, CV_32FC1, 0.0, 100.0, false); - - cv::log(mat, dst_gold); - } -}; - -TEST_P(Log, Accuracy) -{ - cv::Mat dst; - - cv::gpu::GpuMat gpu_res; - - cv::gpu::log(loadMat(mat, useRoi), gpu_res); - - gpu_res.download(dst); - - EXPECT_MAT_NEAR(dst_gold, dst, 1e-5); -} - -INSTANTIATE_TEST_CASE_P(Arithm, Log, Combine( - ALL_DEVICES, - WHOLE_SUBMAT)); - -//////////////////////////////////////////////////////////////////////////////// // magnitude PARAM_TEST_CASE(Magnitude, cv::gpu::DeviceInfo, UseRoi) @@ -816,7 +1807,7 @@ PARAM_TEST_CASE(Magnitude, cv::gpu::DeviceInfo, UseRoi) cv::Mat dst_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); useRoi = GET_PARAM(1); @@ -828,13 +1819,13 @@ PARAM_TEST_CASE(Magnitude, cv::gpu::DeviceInfo, UseRoi) size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); mat1 = randomMat(rng, size, CV_32FC1, 0.0, 100.0, false); - mat2 = randomMat(rng, size, CV_32FC1, 0.0, 100.0, false); + mat2 = randomMat(rng, size, CV_32FC1, 0.0, 100.0, false); cv::magnitude(mat1, mat2, dst_gold); } }; -TEST_P(Magnitude, Accuracy) +TEST_P(Magnitude, Accuracy) { cv::Mat dst; @@ -864,7 +1855,7 @@ PARAM_TEST_CASE(Phase, cv::gpu::DeviceInfo, UseRoi) cv::Mat dst_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); useRoi = GET_PARAM(1); @@ -876,16 +1867,16 @@ PARAM_TEST_CASE(Phase, cv::gpu::DeviceInfo, UseRoi) size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); mat1 = randomMat(rng, size, CV_32FC1, 0.0, 100.0, false); - mat2 = randomMat(rng, size, CV_32FC1, 0.0, 100.0, false); + mat2 = randomMat(rng, size, CV_32FC1, 0.0, 100.0, false); cv::phase(mat1, mat2, dst_gold); } }; -TEST_P(Phase, Accuracy) +TEST_P(Phase, Accuracy) { cv::Mat dst; - + cv::gpu::GpuMat gpu_res; cv::gpu::phase(loadMat(mat1, useRoi), loadMat(mat2, useRoi), gpu_res); @@ -913,7 +1904,7 @@ PARAM_TEST_CASE(CartToPolar, cv::gpu::DeviceInfo, UseRoi) cv::Mat mag_gold; cv::Mat 
angle_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); useRoi = GET_PARAM(1); @@ -925,16 +1916,16 @@ PARAM_TEST_CASE(CartToPolar, cv::gpu::DeviceInfo, UseRoi) size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); mat1 = randomMat(rng, size, CV_32FC1, -100.0, 100.0, false); - mat2 = randomMat(rng, size, CV_32FC1, -100.0, 100.0, false); + mat2 = randomMat(rng, size, CV_32FC1, -100.0, 100.0, false); cv::cartToPolar(mat1, mat2, mag_gold, angle_gold); } }; -TEST_P(CartToPolar, Accuracy) +TEST_P(CartToPolar, Accuracy) { cv::Mat mag, angle; - + cv::gpu::GpuMat gpuMag; cv::gpu::GpuMat gpuAngle; @@ -966,7 +1957,7 @@ PARAM_TEST_CASE(PolarToCart, cv::gpu::DeviceInfo, UseRoi) cv::Mat x_gold; cv::Mat y_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); useRoi = GET_PARAM(1); @@ -978,13 +1969,13 @@ PARAM_TEST_CASE(PolarToCart, cv::gpu::DeviceInfo, UseRoi) size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); mag = randomMat(rng, size, CV_32FC1, -100.0, 100.0, false); - angle = randomMat(rng, size, CV_32FC1, 0.0, 2.0 * CV_PI, false); + angle = randomMat(rng, size, CV_32FC1, 0.0, 2.0 * CV_PI, false); cv::polarToCart(mag, angle, x_gold, y_gold); } }; -TEST_P(PolarToCart, Accuracy) +TEST_P(PolarToCart, Accuracy) { cv::Mat x, y; @@ -1020,7 +2011,7 @@ PARAM_TEST_CASE(MinMax, cv::gpu::DeviceInfo, MatType, UseRoi) double minVal_gold; double maxVal_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); type = GET_PARAM(1); @@ -1039,9 +2030,9 @@ PARAM_TEST_CASE(MinMax, cv::gpu::DeviceInfo, MatType, UseRoi) { cv::minMaxLoc(mat, &minVal_gold, &maxVal_gold, 0, 0, mask); } - else + else { - // OpenCV's minMaxLoc doesn't support CV_8S type + // OpenCV's minMaxLoc doesn't support CV_8S type minVal_gold = std::numeric_limits::max(); maxVal_gold = -std::numeric_limits::max(); for (int i = 0; i < mat.rows; ++i) @@ -1050,11 +2041,11 @@ PARAM_TEST_CASE(MinMax, cv::gpu::DeviceInfo, MatType, UseRoi) const unsigned char* mask_row = mask.ptr(i); for (int j = 0; j < mat.cols; ++j) { - if (mask_row[j]) - { + if (mask_row[j]) + { signed char val = mat_row[j]; if (val < minVal_gold) minVal_gold = val; - if (val > maxVal_gold) maxVal_gold = val; + if (val > maxVal_gold) maxVal_gold = val; } } } @@ -1062,13 +2053,13 @@ PARAM_TEST_CASE(MinMax, cv::gpu::DeviceInfo, MatType, UseRoi) } }; -TEST_P(MinMax, Accuracy) +TEST_P(MinMax, Accuracy) { if (type == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) return; double minVal, maxVal; - + cv::gpu::minMax(loadMat(mat, useRoi), &minVal, &maxVal, loadMat(mask, useRoi)); EXPECT_DOUBLE_EQ(minVal_gold, minVal); @@ -1098,7 +2089,7 @@ PARAM_TEST_CASE(MinMaxLoc, cv::gpu::DeviceInfo, MatType, UseRoi) cv::Point minLoc_gold; cv::Point maxLoc_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); type = GET_PARAM(1); @@ -1117,9 +2108,9 @@ PARAM_TEST_CASE(MinMaxLoc, cv::gpu::DeviceInfo, MatType, UseRoi) { cv::minMaxLoc(mat, &minVal_gold, &maxVal_gold, &minLoc_gold, &maxLoc_gold, mask); } - else + else { - // OpenCV's minMaxLoc doesn't support CV_8S type + // OpenCV's minMaxLoc doesn't support CV_8S type minVal_gold = std::numeric_limits::max(); maxVal_gold = -std::numeric_limits::max(); for (int i = 0; i < mat.rows; ++i) @@ -1128,8 +2119,8 @@ PARAM_TEST_CASE(MinMaxLoc, cv::gpu::DeviceInfo, MatType, UseRoi) const unsigned char* mask_row = mask.ptr(i); for (int j = 0; j < mat.cols; ++j) { - if (mask_row[j]) - { + if (mask_row[j]) + { signed char val = mat_row[j]; if (val < 
minVal_gold) { minVal_gold = val; minLoc_gold = cv::Point(j, i); } if (val > maxVal_gold) { maxVal_gold = val; maxLoc_gold = cv::Point(j, i); } @@ -1140,24 +2131,24 @@ PARAM_TEST_CASE(MinMaxLoc, cv::gpu::DeviceInfo, MatType, UseRoi) } }; -TEST_P(MinMaxLoc, Accuracy) +TEST_P(MinMaxLoc, Accuracy) { if (type == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) return; double minVal, maxVal; cv::Point minLoc, maxLoc; - + cv::gpu::minMaxLoc(loadMat(mat, useRoi), &minVal, &maxVal, &minLoc, &maxLoc, loadMat(mask, useRoi)); EXPECT_DOUBLE_EQ(minVal_gold, minVal); EXPECT_DOUBLE_EQ(maxVal_gold, maxVal); - int cmpMinVals = memcmp(mat.data + minLoc_gold.y * mat.step + minLoc_gold.x * mat.elemSize(), - mat.data + minLoc.y * mat.step + minLoc.x * mat.elemSize(), + int cmpMinVals = memcmp(mat.data + minLoc_gold.y * mat.step + minLoc_gold.x * mat.elemSize(), + mat.data + minLoc.y * mat.step + minLoc.x * mat.elemSize(), mat.elemSize()); - int cmpMaxVals = memcmp(mat.data + maxLoc_gold.y * mat.step + maxLoc_gold.x * mat.elemSize(), - mat.data + maxLoc.y * mat.step + maxLoc.x * mat.elemSize(), + int cmpMaxVals = memcmp(mat.data + maxLoc_gold.y * mat.step + maxLoc_gold.x * mat.elemSize(), + mat.data + maxLoc.y * mat.step + maxLoc.x * mat.elemSize(), mat.elemSize()); EXPECT_EQ(0, cmpMinVals); @@ -1183,7 +2174,7 @@ PARAM_TEST_CASE(CountNonZero, cv::gpu::DeviceInfo, MatType, UseRoi) int n_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); type = GET_PARAM(1); @@ -1202,7 +2193,7 @@ PARAM_TEST_CASE(CountNonZero, cv::gpu::DeviceInfo, MatType, UseRoi) } }; -TEST_P(CountNonZero, Accuracy) +TEST_P(CountNonZero, Accuracy) { if (type == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) return; @@ -1229,7 +2220,7 @@ PARAM_TEST_CASE(Sum, cv::gpu::DeviceInfo, MatType, UseRoi) cv::Size size; cv::Mat mat; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); type = GET_PARAM(1); @@ -1245,7 +2236,7 @@ PARAM_TEST_CASE(Sum, cv::gpu::DeviceInfo, MatType, UseRoi) } }; -TEST_P(Sum, Simple) +TEST_P(Sum, Simple) { if (type == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) return; @@ -1260,7 +2251,7 @@ TEST_P(Sum, Simple) EXPECT_NEAR(sum[3], sum_gold[3], mat.size().area() * 1e-5); } -TEST_P(Sum, Abs) +TEST_P(Sum, Abs) { if (type == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) return; @@ -1275,7 +2266,7 @@ TEST_P(Sum, Abs) EXPECT_NEAR(sum[3], sum_gold[3], mat.size().area() * 1e-5); } -TEST_P(Sum, Sqr) +TEST_P(Sum, Sqr) { if (type == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) return; @@ -1298,203 +2289,6 @@ INSTANTIATE_TEST_CASE_P(Arithm, Sum, Combine( WHOLE_SUBMAT)); ////////////////////////////////////////////////////////////////////////////// -// bitwise - -PARAM_TEST_CASE(Bitwise, cv::gpu::DeviceInfo, MatType) -{ - cv::gpu::DeviceInfo devInfo; - int type; - - cv::Size size; - cv::Mat mat1; - cv::Mat mat2; - - virtual void SetUp() - { - devInfo = GET_PARAM(0); - type = GET_PARAM(1); - - cv::gpu::setDevice(devInfo.deviceID()); - - cv::RNG& rng = cvtest::TS::ptr()->get_rng(); - - size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); - - mat1.create(size, type); - mat2.create(size, type); - - for (int i = 0; i < mat1.rows; ++i) - { - cv::Mat row1(1, static_cast(mat1.cols * mat1.elemSize()), CV_8U, (void*)mat1.ptr(i)); - rng.fill(row1, cv::RNG::UNIFORM, cv::Scalar(0), cv::Scalar(255)); - - cv::Mat row2(1, static_cast(mat2.cols * mat2.elemSize()), CV_8U, (void*)mat2.ptr(i)); - rng.fill(row2, cv::RNG::UNIFORM, 
cv::Scalar(0), cv::Scalar(255)); - } - } -}; - -TEST_P(Bitwise, Not) -{ - if (mat1.depth() == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) - return; - - cv::Mat dst_gold = ~mat1; - - cv::Mat dst; - - cv::gpu::GpuMat dev_dst; - - cv::gpu::bitwise_not(loadMat(mat1), dev_dst); - - dev_dst.download(dst); - - EXPECT_MAT_NEAR(dst_gold, dst, 0.0); -} - -TEST_P(Bitwise, Or) -{ - if (mat1.depth() == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) - return; - - cv::Mat dst_gold = mat1 | mat2; - - cv::Mat dst; - - cv::gpu::GpuMat dev_dst; - - cv::gpu::bitwise_or(loadMat(mat1), loadMat(mat2), dev_dst); - - dev_dst.download(dst); - - EXPECT_MAT_NEAR(dst_gold, dst, 0.0); -} - -TEST_P(Bitwise, And) -{ - if (mat1.depth() == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) - return; - - cv::Mat dst_gold = mat1 & mat2; - - cv::Mat dst; - - cv::gpu::GpuMat dev_dst; - - cv::gpu::bitwise_and(loadMat(mat1), loadMat(mat2), dev_dst); - - dev_dst.download(dst); - - EXPECT_MAT_NEAR(dst_gold, dst, 0.0); -} - -TEST_P(Bitwise, Xor) -{ - if (mat1.depth() == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) - return; - - cv::Mat dst_gold = mat1 ^ mat2; - - cv::Mat dst; - - cv::gpu::GpuMat dev_dst; - - cv::gpu::bitwise_xor(loadMat(mat1), loadMat(mat2), dev_dst); - - dev_dst.download(dst); - - EXPECT_MAT_NEAR(dst_gold, dst, 0.0); -} - -INSTANTIATE_TEST_CASE_P(Arithm, Bitwise, Combine( - ALL_DEVICES, - ALL_TYPES)); - -PARAM_TEST_CASE(BitwiseScalar, cv::gpu::DeviceInfo, MatType) -{ - cv::gpu::DeviceInfo devInfo; - int type; - - cv::Size size; - cv::Mat mat; - cv::Scalar sc; - - virtual void SetUp() - { - devInfo = GET_PARAM(0); - type = GET_PARAM(1); - - cv::gpu::setDevice(devInfo.deviceID()); - - cv::RNG& rng = cvtest::TS::ptr()->get_rng(); - - size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); - - mat.create(size, type); - - for (int i = 0; i < mat.rows; ++i) - { - cv::Mat row(1, static_cast(mat.cols * mat.elemSize()), CV_8U, (void*)mat.ptr(i)); - rng.fill(row, cv::RNG::UNIFORM, cv::Scalar(0), cv::Scalar(255)); - } - - sc = cv::Scalar(rng.uniform(0, 255), rng.uniform(0, 255), rng.uniform(0, 255), rng.uniform(0, 255)); - } -}; - -TEST_P(BitwiseScalar, Or) -{ - cv::Mat dst_gold; - cv::bitwise_or(mat, sc, dst_gold); - - cv::Mat dst; - - cv::gpu::GpuMat dev_dst; - - cv::gpu::bitwise_or(loadMat(mat), sc, dev_dst); - - dev_dst.download(dst); - - EXPECT_MAT_NEAR(dst_gold, dst, 0.0); -} - -TEST_P(BitwiseScalar, And) -{ - cv::Mat dst_gold; - cv::bitwise_and(mat, sc, dst_gold); - - cv::Mat dst; - - cv::gpu::GpuMat dev_dst; - - cv::gpu::bitwise_and(loadMat(mat), sc, dev_dst); - - dev_dst.download(dst); - - EXPECT_MAT_NEAR(dst_gold, dst, 0.0); -} - -TEST_P(BitwiseScalar, Xor) -{ - cv::Mat dst_gold; - cv::bitwise_xor(mat, sc, dst_gold); - - cv::Mat dst; - - cv::gpu::GpuMat dev_dst; - - cv::gpu::bitwise_xor(loadMat(mat), sc, dev_dst); - - dev_dst.download(dst); - - EXPECT_MAT_NEAR(dst_gold, dst, 0.0); -} - -INSTANTIATE_TEST_CASE_P(Arithm, BitwiseScalar, Combine( - ALL_DEVICES, - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32SC1, CV_32SC3, CV_32SC4))); - -////////////////////////////////////////////////////////////////////////////// // addWeighted PARAM_TEST_CASE(AddWeighted, cv::gpu::DeviceInfo, MatType, MatType, MatType, UseRoi) @@ -1514,7 +2308,7 @@ PARAM_TEST_CASE(AddWeighted, cv::gpu::DeviceInfo, MatType, MatType, MatType, Use cv::Mat dst_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); type1 = GET_PARAM(1); @@ -1539,13 
+2333,13 @@ PARAM_TEST_CASE(AddWeighted, cv::gpu::DeviceInfo, MatType, MatType, MatType, Use } }; -TEST_P(AddWeighted, Accuracy) +TEST_P(AddWeighted, Accuracy) { if ((src1.depth() == CV_64F || src2.depth() == CV_64F || dst_gold.depth() == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) return; cv::Mat dst; - + cv::gpu::GpuMat dev_dst; cv::gpu::addWeighted(loadMat(src1, useRoi), alpha, loadMat(src2, useRoi), beta, gamma, dev_dst, dtype); @@ -1570,7 +2364,7 @@ PARAM_TEST_CASE(Reduce, cv::gpu::DeviceInfo, MatType, int, ReduceOp, UseRoi) cv::gpu::DeviceInfo devInfo; int type; int dim; - int reduceOp; + int reduceOp; bool useRoi; cv::Size size; @@ -1578,7 +2372,7 @@ PARAM_TEST_CASE(Reduce, cv::gpu::DeviceInfo, MatType, int, ReduceOp, UseRoi) cv::Mat dst_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); type = GET_PARAM(1); @@ -1605,10 +2399,10 @@ PARAM_TEST_CASE(Reduce, cv::gpu::DeviceInfo, MatType, int, ReduceOp, UseRoi) } }; -TEST_P(Reduce, Accuracy) +TEST_P(Reduce, Accuracy) { cv::Mat dst; - + cv::gpu::GpuMat dev_dst; cv::gpu::reduce(loadMat(src, useRoi), dev_dst, dim, reduceOp, reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG ? CV_32F : CV_MAT_DEPTH(type)); @@ -1645,7 +2439,7 @@ PARAM_TEST_CASE(GEMM, cv::gpu::DeviceInfo, MatType, GemmFlags, UseRoi) cv::Mat dst_gold; - virtual void SetUp() + virtual void SetUp() { devInfo = GET_PARAM(0); type = GET_PARAM(1); @@ -1668,10 +2462,10 @@ PARAM_TEST_CASE(GEMM, cv::gpu::DeviceInfo, MatType, GemmFlags, UseRoi) } }; -TEST_P(GEMM, Accuracy) +TEST_P(GEMM, Accuracy) { cv::Mat dst; - + cv::gpu::GpuMat dev_dst; cv::gpu::gemm(loadMat(src1, useRoi), loadMat(src2, useRoi), alpha, loadMat(src3, useRoi), beta, dev_dst, flags); diff --git a/modules/gpu/test/utility.hpp b/modules/gpu/test/utility.hpp index 76fcb6e..c551cee 100644 --- a/modules/gpu/test/utility.hpp +++ b/modules/gpu/test/utility.hpp @@ -162,10 +162,37 @@ CV_FLAGS(DftFlags, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX #define ALL_DEVICES testing::ValuesIn(devices()) #define DEVICES(feature) testing::ValuesIn(devices(feature)) +#define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113)) + +#define ALL_DEPTH testing::Values(MatDepth(CV_8U), MatDepth(CV_8S), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32S), MatDepth(CV_32F), MatDepth(CV_64F)) #define ALL_TYPES testing::ValuesIn(all_types()) #define TYPES(depth_start, depth_end, cn_start, cn_end) testing::ValuesIn(types(depth_start, depth_end, cn_start, cn_end)) -#define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113)) +#define DEPTH_PAIRS testing::Values(std::make_pair(MatDepth(CV_8U), MatDepth(CV_8U)), \ + std::make_pair(MatDepth(CV_8U), MatDepth(CV_16U)), \ + std::make_pair(MatDepth(CV_8U), MatDepth(CV_16S)), \ + std::make_pair(MatDepth(CV_8U), MatDepth(CV_32S)), \ + std::make_pair(MatDepth(CV_8U), MatDepth(CV_32F)), \ + std::make_pair(MatDepth(CV_8U), MatDepth(CV_64F)), \ + \ + std::make_pair(MatDepth(CV_16U), MatDepth(CV_16U)), \ + std::make_pair(MatDepth(CV_16U), MatDepth(CV_32S)), \ + std::make_pair(MatDepth(CV_16U), MatDepth(CV_32F)), \ + std::make_pair(MatDepth(CV_16U), MatDepth(CV_64F)), \ + \ + std::make_pair(MatDepth(CV_16S), MatDepth(CV_16S)), \ + std::make_pair(MatDepth(CV_16S), MatDepth(CV_32S)), \ + std::make_pair(MatDepth(CV_16S), MatDepth(CV_32F)), \ + std::make_pair(MatDepth(CV_16S), MatDepth(CV_64F)), \ + \ + std::make_pair(MatDepth(CV_32S), MatDepth(CV_32S)), \ + std::make_pair(MatDepth(CV_32S), MatDepth(CV_32F)), \ + 
std::make_pair(MatDepth(CV_32S), MatDepth(CV_64F)), \ + \ + std::make_pair(MatDepth(CV_32F), MatDepth(CV_32F)), \ + std::make_pair(MatDepth(CV_32F), MatDepth(CV_64F)), \ + \ + std::make_pair(MatDepth(CV_64F), MatDepth(CV_64F))) #define WHOLE testing::Values(UseRoi(false)) #define SUBMAT testing::Values(UseRoi(true)) @@ -173,4 +200,6 @@ CV_FLAGS(DftFlags, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX #define DIRECT_INVERSE testing::Values(Inverse(false), Inverse(true)) +#define ALL_CMP_CODES testing::Values(CmpCode(cv::CMP_EQ), CmpCode(cv::CMP_NE), CmpCode(cv::CMP_GT), CmpCode(cv::CMP_GE), CmpCode(cv::CMP_LT), CmpCode(cv::CMP_LE)) + #endif // __OPENCV_TEST_UTILITY_HPP__ -- 2.7.4
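
For reference, the refactored tests in this patch all share one fixture shape: parameters arrive as (DeviceInfo, Size, a source/destination depth pair or a single MatDepth, an optional channel count, UseRoi), SetUp() selects the device, and the test body builds random inputs with randomMat/randomScalar, runs the gpu routine on loadMat/createMat wrappers, recomputes a gold result with the CPU counterpart, and compares with EXPECT_MAT_NEAR. The sketch below is illustrative only, not part of the patch; it assumes the helpers declared in modules/gpu/test/utility.hpp (randomMat, randomScalar, loadMat, createMat, EXPECT_MAT_NEAR, PARAM_TEST_CASE, ALL_DEVICES, DIFFERENT_SIZES, DEPTH_PAIRS, WHOLE_SUBMAT), the OpenCV 2.x gpu API, and a test_precomp.hpp that pulls those in; the fixture and test names are the sketch's own, and the add() call mirrors the Subtract_Scalar call shown in the hunks above.

    #include "test_precomp.hpp"  // assumed to include gtest, opencv2/gpu/gpu.hpp and utility.hpp

    // A fixture parameterised over (device, size, src/dst depth pair, ROI flag),
    // mirroring the *_Scalar fixtures introduced by this patch.
    PARAM_TEST_CASE(Example_Scalar, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi)
    {
        cv::gpu::DeviceInfo devInfo;
        cv::Size size;
        std::pair<MatDepth, MatDepth> depth;
        bool useRoi;

        virtual void SetUp()
        {
            devInfo = GET_PARAM(0);
            size    = GET_PARAM(1);
            depth   = GET_PARAM(2);
            useRoi  = GET_PARAM(3);

            cv::gpu::setDevice(devInfo.deviceID());
        }
    };

    TEST_P(Example_Scalar, Accuracy)
    {
        // Skip double-precision combinations on devices without native double support.
        if ((depth.first == CV_64F || depth.second == CV_64F) &&
            !devInfo.supports(cv::gpu::NATIVE_DOUBLE))
            return;

        cv::Mat src = randomMat(size, depth.first);
        cv::Scalar val = randomScalar(0.0, 255.0);

        cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi);
        cv::gpu::add(loadMat(src, useRoi), val, dst, cv::gpu::GpuMat(), depth.second);

        cv::Mat dst_gold;
        cv::add(src, val, dst_gold, cv::noArray(), depth.second);

        // Integer paths are expected to be bit-exact; float paths get a small tolerance.
        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
    }

    INSTANTIATE_TEST_CASE_P(GPU_Core_Example, Example_Scalar, testing::Combine(
        ALL_DEVICES,
        DIFFERENT_SIZES,
        DEPTH_PAIRS,
        WHOLE_SUBMAT));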
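
The rshift/lshift hunks compute their gold results on the CPU one channel at a time, taking the shift amount for channel c from a cv::Scalar_<int>. Below is a compact sketch of such a left-shift reference in the same spirit as the lhiftImpl/lhiftGold helpers added by the hunks; the template arguments, which the hunk text lost, are spelled out here, the helper names are the sketch's own, and only the integer depths CV_8U..CV_32S are covered, matching the depths the instantiations use.

    #include <opencv2/core/core.hpp>

    namespace
    {
        template <typename T>
        void lshiftImpl(const cv::Mat& src, cv::Scalar_<int> val, cv::Mat& dst)
        {
            const int cn = src.channels();

            dst.create(src.size(), src.type());

            // Shift every element of channel c left by val.val[c].
            for (int y = 0; y < src.rows; ++y)
            {
                for (int x = 0; x < src.cols; ++x)
                {
                    for (int c = 0; c < cn; ++c)
                        dst.at<T>(y, x * cn + c) = src.at<T>(y, x * cn + c) << val.val[c];
                }
            }
        }

        void lshiftGold(const cv::Mat& src, cv::Scalar_<int> val, cv::Mat& dst)
        {
            typedef void (*func_t)(const cv::Mat& src, cv::Scalar_<int> val, cv::Mat& dst);

            // One entry per depth CV_8U..CV_32S; shifts are only meaningful for integer types.
            const func_t funcs[] =
            {
                lshiftImpl<uchar>, lshiftImpl<schar>, lshiftImpl<ushort>, lshiftImpl<short>, lshiftImpl<int>
            };

            funcs[src.depth()](src, val, dst);
        }
    }

The GPU result of cv::gpu::lshift would then be compared against this gold with EXPECT_MAT_NEAR(dst_gold, dst, 0.0), since a bit shift is expected to be exact.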
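
The _8UC4x_32FC1 special-case tests check each output pixel against a saturated per-channel product computed on the host. A self-contained sketch of that per-pixel check follows, with the template arguments the hunk text dropped (ptr<cv::Vec4b>, ptr<float>, saturate_cast<uchar>) restored and all four channels asserted explicitly; the helper name is illustrative only.

    #include <opencv2/core/core.hpp>
    #include <gtest/gtest.h>
    #include <cstdlib>

    // Compare a downloaded GPU result h_dst against the element-wise CPU reference for
    // dst = saturate_cast<uchar>(mat1(CV_8UC4) * mat2(CV_32FC1)), channel by channel.
    static void checkMultiply8UC4By32FC1(const cv::Mat& mat1, const cv::Mat& mat2, const cv::Mat& h_dst)
    {
        for (int y = 0; y < h_dst.rows; ++y)
        {
            const cv::Vec4b* mat1_row = mat1.ptr<cv::Vec4b>(y);
            const float*     mat2_row = mat2.ptr<float>(y);
            const cv::Vec4b* dst_row  = h_dst.ptr<cv::Vec4b>(y);

            for (int x = 0; x < h_dst.cols; ++x)
            {
                const cv::Vec4b val1   = mat1_row[x];
                const float     val2   = mat2_row[x];
                const cv::Vec4b actual = dst_row[x];

                // Check all four channels against the saturated product, allowing
                // at most one unit of rounding difference.
                for (int c = 0; c < 4; ++c)
                {
                    const uchar gold = cv::saturate_cast<uchar>(val1[c] * val2);
                    ASSERT_LE(std::abs(gold - actual[c]), 1);
                }
            }
        }
    }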