From ed749c4bfebe5a6364b4bc639ea34d22de9f8fdf Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 10 Oct 2011 08:19:11 +0000 Subject: [PATCH] added missing arithm operations to gpu module --- modules/gpu/include/opencv2/gpu/gpu.hpp | 64 +- modules/gpu/src/cuda/element_operations.cu | 1342 ++++++++++++++++++++++++---- modules/gpu/src/element_operations.cpp | 770 +++++++++++----- modules/gpu/src/filtering.cpp | 6 +- modules/gpu/test/test_arithm.cpp | 22 +- modules/gpu/test/test_imgproc.cpp | 4 +- 6 files changed, 1781 insertions(+), 427 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 51aadeb..e57ccd2 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -539,32 +539,41 @@ namespace cv //////////////////////////// Per-element operations //////////////////////////////////// //! adds one matrix to another (c = a + b) - //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null()); + CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null()); //! adds scalar to a matrix (c = a + s) - //! supports CV_32FC1 and CV_32FC2 type - CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c, Stream& stream = Stream::Null()); + CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null()); //! subtracts one matrix from another (c = a - b) - //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null()); + CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null()); //! subtracts scalar from a matrix (c = a - s) - //! supports CV_32FC1 and CV_32FC2 type - CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c, Stream& stream = Stream::Null()); - - //! computes element-wise product of the two arrays (c = a * b) - //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null()); - //! multiplies matrix to a scalar (c = a * s) - //! supports CV_32FC1 type - CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c, Stream& stream = Stream::Null()); - - //! computes element-wise quotient of the two arrays (c = a / b) - //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null()); - //! computes element-wise quotient of matrix and scalar (c = a / s) - //! supports CV_32FC1 type - CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, Stream& stream = Stream::Null()); + CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null()); + + //! computes element-wise weighted product of the two arrays (c = scale * a * b) + CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); + //! weighted multiplies matrix to a scalar (c = scale * a * s) + CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); + + //! computes element-wise weighted quotient of the two arrays (c = a / b) + CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); + //! computes element-wise weighted quotient of matrix and scalar (c = a / s) + CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); + //! computes element-wise weighted reciprocal of an array (dst = scale/src2) + CV_EXPORTS void divide(double scale, const GpuMat& src2, GpuMat& dst, int dtype = -1, Stream& stream = Stream::Null()); + + //! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma) + CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, + int dtype = -1, Stream& stream = Stream::Null()); + + //! adds scaled array to another one (dst = alpha*src1 + src2) + static inline void scaleAdd(const GpuMat& src1, double alpha, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null()) + { + addWeighted(src1, alpha, src2, 1.0, 0.0, dst, -1, stream); + } + + //! computes element-wise absolute difference of two arrays (c = abs(a - b)) + CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null()); + //! computes element-wise absolute difference of array and scalar (c = abs(a - s)) + CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null()); //! computes exponent of each matrix element (b = e**a) //! supports only CV_32FC1 type @@ -580,13 +589,6 @@ namespace cv //! supports only CV_32FC1 type CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null()); - //! computes element-wise absolute difference of two arrays (c = abs(a - b)) - //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types - CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null()); - //! computes element-wise absolute difference of array and scalar (c = abs(a - s)) - //! supports only CV_32FC1 type - CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null()); - //! compares elements of two arrays (c = a b) //! supports CV_8UC4, CV_32FC1 types CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null()); @@ -615,10 +617,6 @@ namespace cv //! computes per-element maximum of array and scalar (dst = max(src1, src2)) CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null()); - //! computes the weighted sum of two arrays - CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, - int dtype = -1, Stream& stream = Stream::Null()); - ////////////////////////////// Image processing ////////////////////////////// diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index 70f3bab..ce9281c 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -49,32 +49,1197 @@ namespace cv { namespace gpu { namespace device { + ////////////////////////////////////////////////////////////////////////// + // add + + template struct Add : binary_function + { + __device__ __forceinline__ D operator ()(T a, T b) const + { + return saturate_cast(a + b); + } + }; + + template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream) + { + if (mask.data) + transform((DevMem2D_)src1, (DevMem2D_)src2, (DevMem2D_)dst, mask, Add(), stream); + else + transform((DevMem2D_)src1, (DevMem2D_)src2, (DevMem2D_)dst, Add(), stream); + } + + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + template struct AddScalar : unary_function + { + AddScalar(double val_) : val(val_) {} + __device__ __forceinline__ D operator ()(T a) const + { + return saturate_cast(a + val); + } + const double val; + }; + + template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream) + { + cudaSafeCall( cudaSetDoubleForDevice(&val) ); + AddScalar op(val); + if (mask.data) + transform((DevMem2D_)src1, (DevMem2D_)dst, mask, op, stream); + else + transform((DevMem2D_)src1, (DevMem2D_)dst, op, stream); + } + + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void add_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + ////////////////////////////////////////////////////////////////////////// + // subtract + + template struct Subtract : binary_function + { + __device__ __forceinline__ D operator ()(T a, T b) const + { + return saturate_cast(a - b); + } + }; + + template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream) + { + if (mask.data) + transform((DevMem2D_)src1, (DevMem2D_)src2, (DevMem2D_)dst, mask, Subtract(), stream); + else + transform((DevMem2D_)src1, (DevMem2D_)src2, (DevMem2D_)dst, Subtract(), stream); + } + + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + template struct SubtractScalar : unary_function + { + SubtractScalar(double val_) : val(val_) {} + __device__ __forceinline__ D operator ()(T a) const + { + return saturate_cast(a - val); + } + const double val; + }; + + template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream) + { + cudaSafeCall( cudaSetDoubleForDevice(&val) ); + SubtractScalar op(val); + if (mask.data) + transform((DevMem2D_)src1, (DevMem2D_)dst, mask, op, stream); + else + transform((DevMem2D_)src1, (DevMem2D_)dst, op, stream); + } + + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + //template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + template void subtract_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, const PtrStep& mask, cudaStream_t stream); + + ////////////////////////////////////////////////////////////////////////// + // multiply + + struct multiply_8uc4_32f : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, float b) const + { + uint res = 0; + + res |= (saturate_cast((0xffu & (a )) * b) ); + res |= (saturate_cast((0xffu & (a >> 8)) * b) << 8); + res |= (saturate_cast((0xffu & (a >> 16)) * b) << 16); + res |= (saturate_cast((0xffu & (a >> 24)) * b) << 24); + + return res; + } + }; + + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits + { + enum { smart_block_dim_x = 8 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 8 }; + }; + + void multiply_gpu(const DevMem2D_& src1, const DevMem2Df& src2, const DevMem2D_& dst, cudaStream_t stream) + { + transform(static_cast< DevMem2D_ >(src1), src2, static_cast< DevMem2D_ >(dst), multiply_8uc4_32f(), stream); + } + + struct multiply_16sc4_32f : binary_function + { + __device__ __forceinline__ short4 operator ()(short4 a, float b) const + { + return make_short4(saturate_cast(a.x * b), saturate_cast(a.y * b), + saturate_cast(a.z * b), saturate_cast(a.w * b)); + } + }; + + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits + { + enum { smart_block_dim_x = 8 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 8 }; + }; + + void multiply_gpu(const DevMem2D_& src1, const DevMem2Df& src2, const DevMem2D_& dst, cudaStream_t stream) + { + transform(static_cast< DevMem2D_ >(src1), src2, + static_cast< DevMem2D_ >(dst), multiply_16sc4_32f(), stream); + } + + template struct Multiply : binary_function + { + Multiply(double scale_) : scale(scale_) {} + __device__ __forceinline__ D operator ()(T a, T b) const + { + return saturate_cast(scale * a * b); + } + const double scale; + }; + + template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream) + { + cudaSafeCall( cudaSetDoubleForDevice(&scale) ); + Multiply op(scale); + transform((DevMem2D_)src1, (DevMem2D_)src2, (DevMem2D_)dst, op, stream); + } + + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + + template struct MultiplyScalar : unary_function + { + MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {} + __device__ __forceinline__ D operator ()(T a) const + { + return saturate_cast(scale * a * val); + } + const double val; + const double scale; + }; + + template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream) + { + cudaSafeCall( cudaSetDoubleForDevice(&val) ); + cudaSafeCall( cudaSetDoubleForDevice(&scale) ); + MultiplyScalar op(val, scale); + transform((DevMem2D_)src1, (DevMem2D_)dst, op, stream); + } + + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void multiply_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + + ////////////////////////////////////////////////////////////////////////// + // divide + + struct divide_8uc4_32f : binary_function + { + __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const + { + return b != 0 ? make_uchar4(saturate_cast(a.x / b), saturate_cast(a.y / b), + saturate_cast(a.z / b), saturate_cast(a.w / b)) + : make_uchar4(0,0,0,0); + } + }; + + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits + { + enum { smart_block_dim_x = 8 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 8 }; + }; + + void divide_gpu(const DevMem2D_& src1, const DevMem2Df& src2, const DevMem2D_& dst, cudaStream_t stream) + { + transform(static_cast< DevMem2D_ >(src1), src2, static_cast< DevMem2D_ >(dst), divide_8uc4_32f(), stream); + } + + + struct divide_16sc4_32f : binary_function + { + __device__ __forceinline__ short4 operator ()(short4 a, float b) const + { + return b != 0 ? make_short4(saturate_cast(a.x / b), saturate_cast(a.y / b), + saturate_cast(a.z / b), saturate_cast(a.w / b)) + : make_short4(0,0,0,0); + } + }; + + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits + { + enum { smart_block_dim_x = 8 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 8 }; + }; + + void divide_gpu(const DevMem2D_& src1, const DevMem2Df& src2, const DevMem2D_& dst, cudaStream_t stream) + { + transform(static_cast< DevMem2D_ >(src1), src2, static_cast< DevMem2D_ >(dst), divide_16sc4_32f(), stream); + } + + template struct Divide : binary_function + { + Divide(double scale_) : scale(scale_) {} + __device__ __forceinline__ D operator ()(T a, T b) const + { + return b != 0 ? saturate_cast(scale * a / b) : 0; + } + const double scale; + }; + + template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream) + { + cudaSafeCall( cudaSetDoubleForDevice(&scale) ); + Divide op(scale); + transform((DevMem2D_)src1, (DevMem2D_)src2, (DevMem2D_)dst, op, stream); + } + + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, double scale, cudaStream_t stream); + + template struct DivideScalar : unary_function + { + DivideScalar(double val_, double scale_) : val(val_), scale(scale_) {} + __device__ __forceinline__ D operator ()(T a) const + { + return saturate_cast(scale * a / val); + } + const double val; + const double scale; + }; + + template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream) + { + cudaSafeCall( cudaSetDoubleForDevice(&val) ); + cudaSafeCall( cudaSetDoubleForDevice(&scale) ); + DivideScalar op(val, scale); + transform((DevMem2D_)src1, (DevMem2D_)dst, op, stream); + } + + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + //template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + template void divide_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, double scale, cudaStream_t stream); + + template struct Reciprocal : unary_function + { + Reciprocal(double scale_) : scale(scale_) {} + __device__ __forceinline__ D operator ()(T a) const + { + return a != 0 ? saturate_cast(scale / a) : 0; + } + const double scale; + }; + + template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream) + { + cudaSafeCall( cudaSetDoubleForDevice(&scalar) ); + Reciprocal op(scalar); + transform((DevMem2D_)src2, (DevMem2D_)dst, op, stream); + } + + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void divide_gpu(double scalar, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + + ////////////////////////////////////////////////////////////////////////// + // absdiff + + template struct Absdiff : binary_function + { + static __device__ __forceinline__ int abs(int a) + { + return ::abs(a); + } + static __device__ __forceinline__ float abs(float a) + { + return ::fabsf(a); + } + static __device__ __forceinline__ double abs(double a) + { + return ::fabs(a); + } + + __device__ __forceinline__ T operator ()(T a, T b) const + { + return saturate_cast(abs(a - b)); + } + }; + + template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template void absdiff_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream) + { + transform((DevMem2D_)src1, (DevMem2D_)src2, (DevMem2D_)dst, Absdiff(), stream); + } + + //template void absdiff_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void absdiff_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void absdiff_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void absdiff_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void absdiff_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + //template void absdiff_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void absdiff_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + + template struct AbsdiffScalar : unary_function + { + AbsdiffScalar(double val_) : val(val_) {} + __device__ __forceinline__ T operator ()(T a) const + { + return saturate_cast(::fabs(a - val)); + } + double val; + }; + + template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template void absdiff_gpu(const DevMem2D& src1, double val, const DevMem2D& dst, cudaStream_t stream) + { + cudaSafeCall( cudaSetDoubleForDevice(&val) ); + AbsdiffScalar op(val); + transform((DevMem2D_)src1, (DevMem2D_)dst, op, stream); + } + + template void absdiff_gpu(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream); + template void absdiff_gpu(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream); + template void absdiff_gpu(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream); + template void absdiff_gpu(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream); + template void absdiff_gpu(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream); + //template void absdiff_gpu(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream); + template void absdiff_gpu(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream); + ////////////////////////////////////////////////////////////////////////////////////// // Compare + template struct Equal : binary_function + { + __device__ __forceinline__ uchar operator()(T src1, T src2) const + { + return static_cast((src1 == src2) * 255); + } + }; template struct NotEqual : binary_function { __device__ __forceinline__ uchar operator()(T src1, T src2) const { - return static_cast(static_cast(src1 != src2) * 255); + return static_cast((src1 != src2) * 255); + } + }; + template struct Less : binary_function + { + __device__ __forceinline__ uchar operator()(T src1, T src2) const + { + return static_cast((src1 < src2) * 255); + } + }; + template struct LessEqual : binary_function + { + __device__ __forceinline__ uchar operator()(T src1, T src2) const + { + return static_cast((src1 <= src2) * 255); } }; - template - inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream) + template <> struct TransformFunctorTraits< Equal > : DefaultTransformFunctorTraits< Equal > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Equal > : DefaultTransformFunctorTraits< Equal > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< NotEqual > : DefaultTransformFunctorTraits< NotEqual > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< NotEqual > : DefaultTransformFunctorTraits< NotEqual > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Less > : DefaultTransformFunctorTraits< Less > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Less > : DefaultTransformFunctorTraits< Less > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< LessEqual > : DefaultTransformFunctorTraits< LessEqual > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< LessEqual > : DefaultTransformFunctorTraits< LessEqual > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template