From 7628e57fc62a46b788790c698bd7e06aba92d515 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Mon, 29 Jul 2013 16:43:37 +0400
Subject: [PATCH] used new device layer for cv::gpu::absdiff

---
 modules/cudaarithm/src/cuda/absdiff_mat.cu    | 159 ++++++++++++++++----------
 modules/cudaarithm/src/cuda/absdiff_scalar.cu |  76 ++++++------
 modules/cudaarithm/src/element_operations.cpp | 100 +---------------
 3 files changed, 146 insertions(+), 189 deletions(-)
diff --git a/modules/cudaarithm/src/cuda/absdiff_mat.cu b/modules/cudaarithm/src/cuda/absdiff_mat.cu
index ffdcbfa..0f93b00 100644
--- a/modules/cudaarithm/src/cuda/absdiff_mat.cu
+++ b/modules/cudaarithm/src/cuda/absdiff_mat.cu
@@ -40,43 +40,22 @@
 //
 //M*/
 
-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
 
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
+#ifndef HAVE_OPENCV_CUDEV
 
-#include "arithm_func_traits.hpp"
+#error "opencv_cudev is required"
 
-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#else
 
-namespace arithm
-{
-    struct VAbsDiff4 : binary_function<uint, uint, uint>
-    {
-        __device__ __forceinline__ uint operator ()(uint a, uint b) const
-        {
-            return vabsdiff4(a, b);
-        }
-
-        __host__ __device__ __forceinline__ VAbsDiff4() {}
-        __host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
-    };
+#include "opencv2/cudev.hpp"
 
-    struct VAbsDiff2 : binary_function<uint, uint, uint>
-    {
-        __device__ __forceinline__ uint operator ()(uint a, uint b) const
-        {
-            return vabsdiff2(a, b);
-        }
+using namespace cv::cudev;
 
-        __host__ __device__ __forceinline__ VAbsDiff2() {}
-        __host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
-    };
+void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int);
 
+namespace
+{
     __device__ __forceinline__ int _abs(int a)
     {
         return ::abs(a);
@@ -90,58 +69,120 @@ namespace arithm
         return ::fabs(a);
     }
 
-    template <typename T> struct AbsDiffMat : binary_function<T, T, T>
+    template <typename T> struct AbsDiffOp1 : binary_function<T, T, T>
     {
         __device__ __forceinline__ T operator ()(T a, T b) const
         {
             return saturate_cast<T>(_abs(a - b));
         }
-
-        __host__ __device__ __forceinline__ AbsDiffMat() {}
-        __host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
     };
-}
 
-namespace cv { namespace cuda { namespace device
-{
-    template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
     {
     };
-
-    template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    template <> struct TransformPolicy<double> : DefaultTransformPolicy
     {
+        enum {
+            shift = 1
+        };
     };
 
-    template <typename T> struct TransformFunctorTraits< arithm::AbsDiffMat<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    template <typename T>
+    void absDiffMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
     {
+        gridTransformBinary_< TransformPolicy<T> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<T>(dst), AbsDiffOp1<T>(), stream);
+    }
+
+    struct AbsDiffOp2 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vabsdiff2(a, b);
+        }
     };
-}}}
 
-namespace arithm
-{
-    void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    void absDiffMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
     {
-        device::transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
+        const int vcols = src1.cols >> 1;
+
+        GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
+        GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
+        GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
+
+        gridTransformBinary(src1_, src2_, dst_, AbsDiffOp2(), stream);
     }
 
-    void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    struct AbsDiffOp4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vabsdiff4(a, b);
+        }
+    };
+
+    void absDiffMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
     {
-        device::transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
+        const int vcols = src1.cols >> 2;
+
+        GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
+        GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
+        GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
+
+        gridTransformBinary(src1_, src2_, dst_, AbsDiffOp4(), stream);
     }
+}
 
-    template <typename T>
-    void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int)
+{
+    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
+    static const func_t funcs[] =
+    {
+        absDiffMat_v1<uchar>,
+        absDiffMat_v1<schar>,
+        absDiffMat_v1<ushort>,
+        absDiffMat_v1<short>,
+        absDiffMat_v1<int>,
+        absDiffMat_v1<float>,
+        absDiffMat_v1<double>
+    };
+
+    const int depth = src1.depth();
+
+    CV_DbgAssert( depth < 7 );
+
+    GpuMat src1_ = src1.reshape(1);
+    GpuMat src2_ = src2.reshape(1);
+    GpuMat dst_ = dst.reshape(1);
+
+    if (depth == CV_8U || depth == CV_16U)
     {
-        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream);
+        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
+
+        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
+
+        if (isAllAligned)
+        {
+            if (depth == CV_8U && (src1_.cols & 3) == 0)
+            {
+                absDiffMat_v4(src1_, src2_, dst_, stream);
+                return;
+            }
+            else if (depth == CV_16U && (src1_.cols & 1) == 0)
+            {
+                absDiffMat_v2(src1_, src2_, dst_, stream);
+                return;
+            }
+        }
     }
 
-    template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    const func_t func = funcs[depth];
+
+    if (!func)
+        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
+    func(src1_, src2_, dst_, stream);
 }
 
-#endif // CUDA_DISABLER
+#endif
diff --git a/modules/cudaarithm/src/cuda/absdiff_scalar.cu b/modules/cudaarithm/src/cuda/absdiff_scalar.cu
index ad30bff..f6cebda 100644
--- a/modules/cudaarithm/src/cuda/absdiff_scalar.cu
+++ b/modules/cudaarithm/src/cuda/absdiff_scalar.cu
@@ -40,59 +40,71 @@
 //
 //M*/
 
-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
 
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
+#ifndef HAVE_OPENCV_CUDEV
 
-#include "arithm_func_traits.hpp"
+#error "opencv_cudev is required"
 
-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#else
 
-namespace arithm
+#include "opencv2/cudev.hpp"
+
+using namespace cv::cudev;
+
+void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int);
+
+namespace
 {
-    template <typename T, typename S> struct AbsDiffScalar : unary_function<T, T>
+    template <typename T, typename S> struct AbsDiffScalarOp : unary_function<T, T>
     {
         S val;
 
-        __host__ explicit AbsDiffScalar(S val_) : val(val_) {}
-
         __device__ __forceinline__ T operator ()(T a) const
         {
             abs_func<S> f;
             return saturate_cast<T>(f(a - val));
         }
     };
-}
 
-namespace cv { namespace cuda { namespace device
-{
-    template <typename T, typename S> struct TransformFunctorTraits< arithm::AbsDiffScalar<T, S> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
+    {
+    };
+    template <> struct TransformPolicy<double> : DefaultTransformPolicy
     {
+        enum {
+            shift = 1
+        };
     };
-}}}
 
-namespace arithm
+    template <typename SrcType, typename ScalarDepth>
+    void absDiffScalarImpl(const GpuMat& src, double value, GpuMat& dst, Stream& stream)
+    {
+        AbsDiffScalarOp<SrcType, ScalarDepth> op;
+        op.val = static_cast<ScalarDepth>(value);
+        gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<SrcType>(dst), op, stream);
+    }
+}
+
+void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int)
 {
-    template <typename T, typename S>
-    void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
+    typedef void (*func_t)(const GpuMat& src, double val, GpuMat& dst, Stream& stream);
+    static const func_t funcs[] =
     {
-        AbsDiffScalar<T, S> op(static_cast<S>(val));
+        absDiffScalarImpl<uchar, float>,
+        absDiffScalarImpl<schar, float>,
+        absDiffScalarImpl<ushort, float>,
+        absDiffScalarImpl<short, float>,
+        absDiffScalarImpl<int, float>,
+        absDiffScalarImpl<float, float>,
+        absDiffScalarImpl<double, double>
+    };
 
-        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream);
-    }
+    const int depth = src.depth();
+
+    CV_DbgAssert( depth < 7 );
 
-    template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    funcs[depth](src, val[0], dst, stream);
 }
 
-#endif // CUDA_DISABLER
+#endif
diff --git a/modules/cudaarithm/src/element_operations.cpp b/modules/cudaarithm/src/element_operations.cpp
index b846520..e670061 100644
--- a/modules/cudaarithm/src/element_operations.cpp
+++ b/modules/cudaarithm/src/element_operations.cpp
@@ -442,105 +442,9 @@ void cv::cuda::divide(InputArray _src1, InputArray _src2, OutputArray _dst, doub
 //////////////////////////////////////////////////////////////////////////////
 // absdiff
 
-namespace arithm
-{
-    void absDiffMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
-    void absDiffMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
-
-    template <typename T>
-    void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-}
-
-static void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& _stream, int)
-{
-    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        arithm::absDiffMat<unsigned char>,
-        arithm::absDiffMat<signed char>,
-        arithm::absDiffMat<unsigned short>,
-        arithm::absDiffMat<short>,
-        arithm::absDiffMat<int>,
-        arithm::absDiffMat<float>,
-        arithm::absDiffMat<double>
-    };
-
-    const int depth = src1.depth();
-    const int cn = src1.channels();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-
-    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
-    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
-    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
-
-    if (depth == CV_8U || depth == CV_16U)
-    {
-        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
-        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
-        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
-
-        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
-
-        if (isAllAligned)
-        {
-            if (depth == CV_8U && (src1_.cols & 3) == 0)
-            {
-                const int vcols = src1_.cols >> 2;
-
-                arithm::absDiffMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                                      PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                                      PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                                      stream);
-
-                return;
-            }
-            else if (depth == CV_16U && (src1_.cols & 1) == 0)
-            {
-                const int vcols = src1_.cols >> 1;
-
-                arithm::absDiffMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                                      PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                                      PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                                      stream);
+void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int);
 
-                return;
-            }
-        }
-    }
-
-    const func_t func = funcs[depth];
-
-    if (!func)
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
-
-    func(src1_, src2_, dst_, stream);
-}
-
-namespace arithm
-{
-    template <typename T, typename S>
-    void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-}
-
-static void absDiffScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int)
-{
-    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        arithm::absDiffScalar<unsigned char, float>,
-        arithm::absDiffScalar<signed char, float>,
-        arithm::absDiffScalar<unsigned short, float>,
-        arithm::absDiffScalar<short, float>,
-        arithm::absDiffScalar<int, float>,
-        arithm::absDiffScalar<float, float>,
-        arithm::absDiffScalar<double, double>
-    };
-
-    const int depth = src.depth();
-
-    funcs[depth](src, val[0], dst, StreamAccessor::getStream(stream));
-}
+void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int);
 
 void cv::cuda::absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream& stream)
 {
-- 
2.7.4