From 6dfd8f184c14f6f286a11bcfe9bc9a98c554e6e2 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 22 Jul 2013 13:07:07 +0400
Subject: [PATCH] rewrote matrix operations with cudev module

---
 modules/core/CMakeLists.txt                 |   4 +-
 modules/core/src/cuda/gpu_mat.cu            | 486 +++++++++++++++
 modules/core/src/cuda/matrix_operations.cu  | 296 ----------
 modules/core/src/cuda/matrix_operations.hpp |  57 --
 modules/core/src/gpu_mat.cpp                | 878 ++++------------------------
 5 files changed, 586 insertions(+), 1135 deletions(-)
 create mode 100644 modules/core/src/cuda/gpu_mat.cu
 delete mode 100644 modules/core/src/cuda/matrix_operations.cu
 delete mode 100644 modules/core/src/cuda/matrix_operations.hpp

diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 8b3c6c7..09644d5 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,9 +1,9 @@
 set(the_description "The Core Functionality")
-ocv_add_module(core ${ZLIB_LIBRARIES})
+ocv_add_module(core ${ZLIB_LIBRARIES} OPTIONAL opencv_cudev)
 ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
 
 if(HAVE_CUDA)
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wenum-compare -Wunused-function)
 endif()
 
 file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
diff --git a/modules/core/src/cuda/gpu_mat.cu b/modules/core/src/cuda/gpu_mat.cu
new file mode 100644
index 0000000..0db1584
--- /dev/null
+++ b/modules/core/src/cuda/gpu_mat.cu
@@ -0,0 +1,486 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "opencv2/opencv_modules.hpp"
+
+#ifndef HAVE_OPENCV_CUDEV
+
+#error "opencv_cudev is required"
+
+#else
+
+#include "opencv2/core/gpu.hpp"
+#include "opencv2/cudev.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+using namespace cv::cudev;
+
+/////////////////////////////////////////////////////
+/// create
+
+void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
+{
+    CV_DbgAssert( _rows >= 0 && _cols >= 0 );
+
+    _type &= Mat::TYPE_MASK;
+
+    if (rows == _rows && cols == _cols && type() == _type && data)
+        return;
+
+    if (data)
+        release();
+
+    if (_rows > 0 && _cols > 0)
+    {
+        flags = Mat::MAGIC_VAL + _type;
+        rows = _rows;
+        cols = _cols;
+
+        size_t esz = elemSize();
+
+        void* devPtr;
+
+        if (rows > 1 && cols > 1)
+        {
+            CV_CUDEV_SAFE_CALL( cudaMallocPitch(&devPtr, &step, esz * cols, rows) );
+        }
+        else
+        {
+            // Single row or single column must be continuous
+            CV_CUDEV_SAFE_CALL( cudaMalloc(&devPtr, esz * cols * rows) );
+            step = esz * cols;
+        }
+
+        if (esz * cols == step)
+            flags |= Mat::CONTINUOUS_FLAG;
+
+        int64 _nettosize = static_cast<int64>(step) * rows;
+        size_t nettosize = static_cast<size_t>(_nettosize);
+
+        datastart = data = static_cast<uchar*>(devPtr);
+        dataend = data + nettosize;
+
+        refcount = static_cast<int*>(fastMalloc(sizeof(*refcount)));
+        *refcount = 1;
+    }
+}
+
+/////////////////////////////////////////////////////
+/// release
+
+void cv::gpu::GpuMat::release()
+{
+    if (refcount && CV_XADD(refcount, -1) == 1)
+    {
+        cudaFree(datastart);
+        fastFree(refcount);
+    }
+
+    data = datastart = dataend = 0;
+    step = rows = cols = 0;
+    refcount = 0;
+}
+
+/////////////////////////////////////////////////////
+/// upload
+
+void cv::gpu::GpuMat::upload(InputArray arr)
+{
+    Mat mat = arr.getMat();
+
+    CV_DbgAssert( !mat.empty() );
+
+    create(mat.size(), mat.type());
+
+    CV_CUDEV_SAFE_CALL( cudaMemcpy2D(data, step, mat.data, mat.step, cols * elemSize(), rows, cudaMemcpyHostToDevice) );
+}
+
+void cv::gpu::GpuMat::upload(InputArray arr, Stream& _stream)
+{
+    Mat mat = arr.getMat();
+
+    CV_DbgAssert( !mat.empty() );
+
+    create(mat.size(), mat.type());
+
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
+    CV_CUDEV_SAFE_CALL( cudaMemcpy2DAsync(data, step, mat.data, mat.step, cols * elemSize(), rows, cudaMemcpyHostToDevice, stream) );
+}
+
+/////////////////////////////////////////////////////
+/// download
+
+void cv::gpu::GpuMat::download(OutputArray _dst) const
+{
+    CV_DbgAssert( !empty() );
+
+    _dst.create(size(), type());
+    Mat dst = _dst.getMat();
+
+    CV_CUDEV_SAFE_CALL( cudaMemcpy2D(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToHost) );
+}
+
+void cv::gpu::GpuMat::download(OutputArray _dst, Stream& _stream) const
+{
+    CV_DbgAssert( !empty() );
+
+    _dst.create(size(), type());
+    Mat dst = _dst.getMat();
+
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
+    CV_CUDEV_SAFE_CALL( cudaMemcpy2DAsync(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToHost, stream) );
+}
+
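+// Usage sketch for the transfer methods above (illustrative only, assumes a
+// CUDA-capable device is present):
+//
+//   cv::Mat host(480, 640, CV_8UC3);
+//   cv::gpu::GpuMat dev;
+//   dev.upload(host);      // pitched host -> device copy via cudaMemcpy2D
+//   dev.download(host);    // pitched device -> host copy via cudaMemcpy2D
+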
+/////////////////////////////////////////////////////
+/// copyTo
+
+void cv::gpu::GpuMat::copyTo(OutputArray _dst) const
+{
+    CV_DbgAssert( !empty() );
+
+    _dst.create(size(), type());
+    GpuMat dst = _dst.getGpuMat();
+
+    CV_CUDEV_SAFE_CALL( cudaMemcpy2D(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToDevice) );
+}
+
+void cv::gpu::GpuMat::copyTo(OutputArray _dst, Stream& _stream) const
+{
+    CV_DbgAssert( !empty() );
+
+    _dst.create(size(), type());
+    GpuMat dst = _dst.getGpuMat();
+
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
+    CV_CUDEV_SAFE_CALL( cudaMemcpy2DAsync(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToDevice, stream) );
+}
+
+namespace
+{
+    template <size_t size> struct CopyToPolicy : DefaultTransformPolicy
+    {
+    };
+    template <> struct CopyToPolicy<4> : DefaultTransformPolicy
+    {
+        enum {
+            shift = 2
+        };
+    };
+    template <> struct CopyToPolicy<8> : DefaultTransformPolicy
+    {
+        enum {
+            shift = 1
+        };
+    };
+
+    template <typename T>
+    void copyWithMask(const GpuMat& src, const GpuMat& dst, const GpuMat& mask, Stream& stream)
+    {
+        gridTransform_< CopyToPolicy<sizeof(typename VecTraits<T>::elem_type)> >(globPtr<T>(src), globPtr<T>(dst), identity<T>(), globPtr<uchar>(mask), stream);
+    }
+}
+
+void cv::gpu::GpuMat::copyTo(OutputArray _dst, InputArray _mask, Stream& stream) const
+{
+    CV_DbgAssert( !empty() );
+    CV_DbgAssert( depth() <= CV_64F && channels() <= 4 );
+
+    GpuMat mask = _mask.getGpuMat();
+    CV_DbgAssert( size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == channels()) );
+
+    _dst.create(size(), type());
+    GpuMat dst = _dst.getGpuMat();
+
+    typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, const GpuMat& mask, Stream& stream);
+    static const func_t funcs[9][4] =
+    {
+        {0,0,0,0},
+        {copyWithMask<uchar>, copyWithMask<uchar2>, copyWithMask<uchar3>, copyWithMask<uchar4>},
+        {copyWithMask<ushort>, copyWithMask<ushort2>, copyWithMask<ushort3>, copyWithMask<ushort4>},
+        {0,0,0,0},
+        {copyWithMask<int>, copyWithMask<int2>, copyWithMask<int3>, copyWithMask<int4>},
+        {0,0,0,0},
+        {0,0,0,0},
+        {0,0,0,0},
+        {copyWithMask<double>, copyWithMask<double2>, copyWithMask<double3>, copyWithMask<double4>}
+    };
+
+    if (mask.channels() == channels())
+    {
+        const func_t func = funcs[elemSize1()][0];
+        CV_DbgAssert( func != 0 );
+        func(reshape(1), dst.reshape(1), mask.reshape(1), stream);
+    }
+    else
+    {
+        const func_t func = funcs[elemSize1()][channels() - 1];
+        CV_DbgAssert( func != 0 );
+        func(*this, dst, mask, stream);
+    }
+}
+
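+// Dispatch note: the masked copy above is selected by elemSize1(). When the
+// mask has the same channel count as the matrix, src/dst/mask are all viewed
+// as single-channel via reshape(1), so one instantiation per element size
+// covers every channel count.
+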
+/////////////////////////////////////////////////////
+/// setTo
+
+namespace
+{
+    template <typename T>
+    void setToWithOutMask(const GpuMat& mat, Scalar _scalar, Stream& stream)
+    {
+        Scalar_<typename VecTraits<T>::elem_type> scalar = _scalar;
+        gridTransform(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), stream);
+    }
+
+    template <typename T>
+    void setToWithMask(const GpuMat& mat, const GpuMat& mask, Scalar _scalar, Stream& stream)
+    {
+        Scalar_<typename VecTraits<T>::elem_type> scalar = _scalar;
+        gridTransform(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), globPtr<uchar>(mask), stream);
+    }
+}
+
+GpuMat& cv::gpu::GpuMat::setTo(Scalar value, Stream& stream)
+{
+    CV_DbgAssert( !empty() );
+    CV_DbgAssert( depth() <= CV_64F && channels() <= 4 );
+
+    if (value[0] == 0.0 && value[1] == 0.0 && value[2] == 0.0 && value[3] == 0.0)
+    {
+        // Zero fill
+
+        if (stream)
+            CV_CUDEV_SAFE_CALL( cudaMemset2DAsync(data, step, 0, cols * elemSize(), rows, StreamAccessor::getStream(stream)) );
+        else
+            CV_CUDEV_SAFE_CALL( cudaMemset2D(data, step, 0, cols * elemSize(), rows) );
+
+        return *this;
+    }
+
+    if (depth() == CV_8U)
+    {
+        const int cn = channels();
+
+        if (cn == 1
+            || (cn == 2 && value[0] == value[1])
+            || (cn == 3 && value[0] == value[1] && value[0] == value[2])
+            || (cn == 4 && value[0] == value[1] && value[0] == value[2] && value[0] == value[3]))
+        {
+            const int val = cv::saturate_cast<uchar>(value[0]);
+
+            if (stream)
+                CV_CUDEV_SAFE_CALL( cudaMemset2DAsync(data, step, val, cols * elemSize(), rows, StreamAccessor::getStream(stream)) );
+            else
+                CV_CUDEV_SAFE_CALL( cudaMemset2D(data, step, val, cols * elemSize(), rows) );
+
+            return *this;
+        }
+    }
+
+    typedef void (*func_t)(const GpuMat& mat, Scalar scalar, Stream& stream);
+    static const func_t funcs[7][4] =
+    {
+        {setToWithOutMask<uchar>,setToWithOutMask<uchar2>,setToWithOutMask<uchar3>,setToWithOutMask<uchar4>},
+        {setToWithOutMask<schar>,setToWithOutMask<schar2>,setToWithOutMask<schar3>,setToWithOutMask<schar4>},
+        {setToWithOutMask<ushort>,setToWithOutMask<ushort2>,setToWithOutMask<ushort3>,setToWithOutMask<ushort4>},
+        {setToWithOutMask<short>,setToWithOutMask<short2>,setToWithOutMask<short3>,setToWithOutMask<short4>},
+        {setToWithOutMask<int>,setToWithOutMask<int2>,setToWithOutMask<int3>,setToWithOutMask<int4>},
+        {setToWithOutMask<float>,setToWithOutMask<float2>,setToWithOutMask<float3>,setToWithOutMask<float4>},
+        {setToWithOutMask<double>,setToWithOutMask<double2>,setToWithOutMask<double3>,setToWithOutMask<double4>}
+    };
+
+    funcs[depth()][channels() - 1](*this, value, stream);
+
+    return *this;
+}
+
+GpuMat& cv::gpu::GpuMat::setTo(Scalar value, InputArray _mask, Stream& stream)
+{
+    CV_DbgAssert( !empty() );
+    CV_DbgAssert( depth() <= CV_64F && channels() <= 4 );
+
+    GpuMat mask = _mask.getGpuMat();
+
+    CV_DbgAssert( size() == mask.size() && mask.type() == CV_8UC1 );
+
+    typedef void (*func_t)(const GpuMat& mat, const GpuMat& mask, Scalar scalar, Stream& stream);
+    static const func_t funcs[7][4] =
+    {
+        {setToWithMask<uchar>,setToWithMask<uchar2>,setToWithMask<uchar3>,setToWithMask<uchar4>},
+        {setToWithMask<schar>,setToWithMask<schar2>,setToWithMask<schar3>,setToWithMask<schar4>},
+        {setToWithMask<ushort>,setToWithMask<ushort2>,setToWithMask<ushort3>,setToWithMask<ushort4>},
+        {setToWithMask<short>,setToWithMask<short2>,setToWithMask<short3>,setToWithMask<short4>},
+        {setToWithMask<int>,setToWithMask<int2>,setToWithMask<int3>,setToWithMask<int4>},
+        {setToWithMask<float>,setToWithMask<float2>,setToWithMask<float3>,setToWithMask<float4>},
+        {setToWithMask<double>,setToWithMask<double2>,setToWithMask<double3>,setToWithMask<double4>}
+    };
+
+    funcs[depth()][channels() - 1](*this, mask, value, stream);
+
+    return *this;
+}
+
+/////////////////////////////////////////////////////
+/// convertTo
+
+namespace
+{
+    template <typename T> struct ConvertToPolicy : DefaultTransformPolicy
+    {
+    };
+    template <> struct ConvertToPolicy<double> : DefaultTransformPolicy
+    {
+        enum {
+            shift = 1
+        };
+    };
+
+    template <typename T, typename D>
+    void convertToNoScale(const GpuMat& src, const GpuMat& dst, Stream& stream)
+    {
+        typedef typename VecTraits<T>::elem_type src_elem_type;
+        typedef typename VecTraits<D>::elem_type dst_elem_type;
+        typedef typename LargerType<src_elem_type, float>::type larger_elem_type;
+        typedef typename LargerType<larger_elem_type, dst_elem_type>::type scalar_type;
+
+        gridTransform_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), saturate_cast_func<T, D>(), stream);
+    }
+
+    template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
+    {
+        S alpha;
+        S beta;
+
+        __device__ __forceinline__ D operator ()(typename TypeTraits<T>::parameter_type src) const
+        {
+            return cudev::saturate_cast<D>(alpha * src + beta);
+        }
+    };
+
+    template <typename T, typename D>
+    void convertToScale(const GpuMat& src, const GpuMat& dst, double alpha, double beta, Stream& stream)
+    {
+        typedef typename VecTraits<T>::elem_type src_elem_type;
+        typedef typename VecTraits<D>::elem_type dst_elem_type;
+        typedef typename LargerType<src_elem_type, float>::type larger_elem_type;
+        typedef typename LargerType<larger_elem_type, dst_elem_type>::type scalar_type;
+
+        Convertor<T, D, scalar_type> op;
+        op.alpha = cv::saturate_cast<scalar_type>(alpha);
+        op.beta = cv::saturate_cast<scalar_type>(beta);
+
+        gridTransform_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), op, stream);
+    }
+}
+
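+// The scaled conversion evaluates saturate_cast<D>(alpha * src + beta) in a
+// single fused pass; alpha and beta are cast to scalar_type once on the host
+// so the device functor carries only two values of state.
+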
+ } +} + +void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& stream) const +{ + if (rtype < 0) + rtype = type(); + else + rtype = CV_MAKE_TYPE(CV_MAT_DEPTH(rtype), channels()); + + const int sdepth = depth(); + const int ddepth = CV_MAT_DEPTH(rtype); + if (sdepth == ddepth) + { + if (stream) + copyTo(_dst, stream); + else + copyTo(_dst); + + return; + } + + CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F ); + + GpuMat src = *this; + + _dst.create(size(), rtype); + GpuMat dst = _dst.getGpuMat(); + + typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, Stream& stream); + static const func_t funcs[7][7] = + { + {0, convertToNoScale, convertToNoScale, convertToNoScale, convertToNoScale, convertToNoScale, convertToNoScale}, + {convertToNoScale, 0, convertToNoScale, convertToNoScale, convertToNoScale, convertToNoScale, convertToNoScale}, + {convertToNoScale, convertToNoScale, 0, convertToNoScale, convertToNoScale, convertToNoScale, convertToNoScale}, + {convertToNoScale, convertToNoScale, convertToNoScale, 0, convertToNoScale, convertToNoScale, convertToNoScale}, + {convertToNoScale, convertToNoScale, convertToNoScale, convertToNoScale, 0, convertToNoScale, convertToNoScale}, + {convertToNoScale, convertToNoScale, convertToNoScale, convertToNoScale, convertToNoScale, 0, convertToNoScale}, + {convertToNoScale, convertToNoScale, convertToNoScale, convertToNoScale, convertToNoScale, convertToNoScale, 0} + }; + + funcs[sdepth][ddepth](reshape(1), dst.reshape(1), stream); +} + +void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, double alpha, double beta, Stream& stream) const +{ + if (rtype < 0) + rtype = type(); + else + rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels()); + + const int sdepth = depth(); + const int ddepth = CV_MAT_DEPTH(rtype); + + GpuMat src = *this; + + _dst.create(size(), rtype); + GpuMat dst = _dst.getGpuMat(); + + typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, double alpha, double beta, Stream& stream); + static const func_t funcs[7][7] = + { + {convertToScale, convertToScale, convertToScale, convertToScale, convertToScale, convertToScale, convertToScale}, + {convertToScale, convertToScale, convertToScale, convertToScale, convertToScale, convertToScale, convertToScale}, + {convertToScale, convertToScale, convertToScale, convertToScale, convertToScale, convertToScale, convertToScale}, + {convertToScale, convertToScale, convertToScale, convertToScale, convertToScale, convertToScale, convertToScale}, + {convertToScale, convertToScale, convertToScale, convertToScale, convertToScale, convertToScale, convertToScale}, + {convertToScale, convertToScale, convertToScale, convertToScale, convertToScale, convertToScale, convertToScale}, + {convertToScale, convertToScale, convertToScale, convertToScale, convertToScale, convertToScale, convertToScale} + }; + + funcs[sdepth][ddepth](reshape(1), dst.reshape(1), alpha, beta, stream); +} + +#endif diff --git a/modules/core/src/cuda/matrix_operations.cu b/modules/core/src/cuda/matrix_operations.cu deleted file mode 100644 index 7de5205..0000000 --- a/modules/core/src/cuda/matrix_operations.cu +++ /dev/null @@ -1,296 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. 
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                          License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/type_traits.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-
-#include "matrix_operations.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    ///////////////////////////////////////////////////////////////////////////
-    // copyWithMask
-
-    template <typename T>
-    void copyWithMask(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream)
-    {
-        if (multiChannelMask)
-            cv::gpu::cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, identity<T>(), SingleMask(mask), stream);
-        else
-            cv::gpu::cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, identity<T>(), SingleMaskChannels(mask, cn), stream);
-    }
-
-    void copyWithMask(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream)
-    {
-        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream);
-
-        static const func_t tab[] =
-        {
-            0,
-            copyWithMask<uchar>,
-            copyWithMask<ushort>,
-            0,
-            copyWithMask<int>,
-            0,
-            0,
-            0,
-            copyWithMask<double>
-        };
-
-        const func_t func = tab[elemSize1];
-        CV_DbgAssert( func != 0 );
-
-        func(src, dst, cn, mask, multiChannelMask, stream);
-    }
-
-    ///////////////////////////////////////////////////////////////////////////
-    // set
-
-    template <typename T, class Mask>
-    __global__ void set(PtrStepSz<T> mat, const Mask mask, const int channels, const typename TypeVec<T, 4>::vec_type value)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x >= mat.cols * channels || y >= mat.rows)
-            return;
-
-        const T scalar[4] = {value.x, value.y, value.z, value.w};
-
-        if (mask(y, x / channels))
-            mat(y, x) = scalar[x % channels];
-    }
-
-    template <typename T>
-    void set(PtrStepSz<T> mat, const T* scalar, int channels, cudaStream_t stream)
-    {
-        typedef typename TypeVec<T, 4>::vec_type scalar_t;
-
-        dim3 block(32, 8);
-        dim3 grid(divUp(mat.cols * channels, block.x), divUp(mat.rows, block.y));
-
-        set<T><<<grid, block, 0, stream>>>(mat, WithOutMask(), channels, VecTraits<scalar_t>::make(scalar));
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall ( cudaDeviceSynchronize() );
-    }
-
-    template void set<uchar >(PtrStepSz<uchar > mat, const uchar* scalar, int channels, cudaStream_t stream);
-    template void set<schar >(PtrStepSz<schar > mat, const schar* scalar, int channels, cudaStream_t stream);
-    template void set<ushort>(PtrStepSz<ushort> mat, const ushort* scalar, int channels, cudaStream_t stream);
-    template void set<short >(PtrStepSz<short > mat, const short* scalar, int channels, cudaStream_t stream);
-    template void set<int   >(PtrStepSz<int   > mat, const int* scalar, int channels, cudaStream_t stream);
-    template void set<float >(PtrStepSz<float > mat, const float* scalar, int channels, cudaStream_t stream);
-    template void set<double>(PtrStepSz<double> mat, const double* scalar, int channels, cudaStream_t stream);
-
-    template <typename T>
-    void set(PtrStepSz<T> mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream)
-    {
-        typedef typename TypeVec<T, 4>::vec_type scalar_t;
-
-        dim3 block(32, 8);
-        dim3 grid(divUp(mat.cols * channels, block.x), divUp(mat.rows, block.y));
-
-        set<T><<<grid, block, 0, stream>>>(mat, SingleMask(mask), channels, VecTraits<scalar_t>::make(scalar));
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall ( cudaDeviceSynchronize() );
-    }
-
-    template void set<uchar >(PtrStepSz<uchar > mat, const uchar* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set<schar >(PtrStepSz<schar > mat, const schar* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set<ushort>(PtrStepSz<ushort> mat, const ushort* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set<short >(PtrStepSz<short > mat, const short* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set<int   >(PtrStepSz<int   > mat, const int* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set<float >(PtrStepSz<float > mat, const float* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set<double>(PtrStepSz<double> mat, const double* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-
-    ///////////////////////////////////////////////////////////////////////////
-    // convert
-
-    template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
-    {
-        Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}
-
-        __device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
-        {
-            return saturate_cast<D>(alpha * src + beta);
-        }
-
-        S alpha, beta;
-    };
-
-    namespace detail
-    {
-        template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
-        {
-        };
-        template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
-        {
-            enum { smart_shift = 8 };
-        };
-        template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
-        {
-            enum { smart_shift = 4 };
-        };
-        template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
-        {
-            enum { smart_block_dim_y = 8 };
-            enum { smart_shift = 4 };
-        };
-
-        template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
-        {
-            enum { smart_shift = 4 };
-        };
-        template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
-        {
-            enum { smart_shift = 2 };
-        };
-
-        template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
-        {
-            enum { smart_block_dim_y = 8 };
-            enum { smart_shift = 4 };
-        };
-        template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
-        {
-            enum { smart_block_dim_y = 8 };
-            enum { smart_shift = 2 };
-        };
-
-        template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
-        {
-        };
-    }
-
-    template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
-    {
-    };
-
-    template <typename T, typename D, typename S>
-    void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
-    {
-        Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
-        cv::gpu::cudev::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
-    }
-
-    void convert(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);
-
-        static const caller_t tab[7][7] =
-        {
-            {
-                cvt_<uchar, uchar, float>,
-                cvt_<uchar, schar, float>,
-                cvt_<uchar, ushort, float>,
-                cvt_<uchar, short, float>,
-                cvt_<uchar, int, float>,
-                cvt_<uchar, float, float>,
-                cvt_<uchar, double, double>
-            },
-            {
-                cvt_<schar, uchar, float>,
-                cvt_<schar, schar, float>,
-                cvt_<schar, ushort, float>,
-                cvt_<schar, short, float>,
-                cvt_<schar, int, float>,
-                cvt_<schar, float, float>,
-                cvt_<schar, double, double>
-            },
-            {
-                cvt_<ushort, uchar, float>,
-                cvt_<ushort, schar, float>,
-                cvt_<ushort, ushort, float>,
-                cvt_<ushort, short, float>,
-                cvt_<ushort, int, float>,
-                cvt_<ushort, float, float>,
-                cvt_<ushort, double, double>
-            },
-            {
-                cvt_<short, uchar, float>,
-                cvt_<short, schar, float>,
-                cvt_<short, ushort, float>,
-                cvt_<short, short, float>,
-                cvt_<short, int, float>,
-                cvt_<short, float, float>,
-                cvt_<short, double, double>
-            },
-            {
-                cvt_<int, uchar, float>,
-                cvt_<int, schar, float>,
-                cvt_<int, ushort, float>,
-                cvt_<int, short, float>,
-                cvt_<int, int, double>,
-                cvt_<int, float, double>,
-                cvt_<int, double, double>
-            },
-            {
-                cvt_<float, uchar, float>,
-                cvt_<float, schar, float>,
-                cvt_<float, ushort, float>,
-                cvt_<float, short, float>,
-                cvt_<float, int, float>,
-                cvt_<float, float, float>,
-                cvt_<float, double, double>
-            },
-            {
-                cvt_<double, uchar, double>,
-                cvt_<double, schar, double>,
-                cvt_<double, ushort, double>,
-                cvt_<double, short, double>,
-                cvt_<double, int, double>,
-                cvt_<double, float, double>,
-                cvt_<double, double, double>
-            }
-        };
-
-        const caller_t func = tab[sdepth][ddepth];
-        func(src, dst, alpha, beta, stream);
-    }
-}}} // namespace cv { namespace gpu { namespace cudev
diff --git a/modules/core/src/cuda/matrix_operations.hpp b/modules/core/src/cuda/matrix_operations.hpp
deleted file mode 100644
index 4e45106..0000000
--- a/modules/core/src/cuda/matrix_operations.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                          License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    void copyWithMask(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream);
-
-    template <typename T>
-    void set(PtrStepSz<T> mat, const T* scalar, int channels, cudaStream_t stream);
-
-    template <typename T>
-    void set(PtrStepSz<T> mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-
-    void convert(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
-}}}
diff --git a/modules/core/src/gpu_mat.cpp b/modules/core/src/gpu_mat.cpp
index a2e8da6..33a6046 100644
--- a/modules/core/src/gpu_mat.cpp
+++ b/modules/core/src/gpu_mat.cpp
@@ -46,504 +46,6 @@
 using namespace cv;
 using namespace cv::gpu;
 
-/////////////////////////// matrix operations /////////////////////////
-
-#ifdef HAVE_CUDA
-
-// CUDA implementation
-
-#include "cuda/matrix_operations.hpp"
-
-namespace
-{
-    template <typename T> void cudaSet_(GpuMat& src, Scalar s, cudaStream_t stream)
-    {
-        Scalar_<T> sf = s;
-        cudev::set<T>(PtrStepSz<T>(src), sf.val, src.channels(), stream);
-    }
-
-    void cudaSet(GpuMat& src, Scalar s, cudaStream_t stream)
-    {
-        typedef void (*func_t)(GpuMat& src, Scalar s, cudaStream_t stream);
-        static const func_t funcs[] =
-        {
-            cudaSet_<uchar>,
-            cudaSet_<schar>,
-            cudaSet_<ushort>,
-            cudaSet_<short>,
-            cudaSet_<int>,
-            cudaSet_<float>,
-            cudaSet_<double>
-        };
-
-        funcs[src.depth()](src, s, stream);
-    }
-
-    template <typename T> void cudaSet_(GpuMat& src, Scalar s, PtrStepSzb mask, cudaStream_t stream)
-    {
-        Scalar_<T> sf = s;
-        cudev::set<T>(PtrStepSz<T>(src), sf.val, mask, src.channels(), stream);
-    }
-
-    void cudaSet(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
-    {
-        typedef void (*func_t)(GpuMat& src, Scalar s, PtrStepSzb mask, cudaStream_t stream);
-        static const func_t funcs[] =
-        {
-            cudaSet_<uchar>,
-            cudaSet_<schar>,
-            cudaSet_<ushort>,
-            cudaSet_<short>,
-            cudaSet_<int>,
-            cudaSet_<float>,
-            cudaSet_<double>
-        };
-
-        funcs[src.depth()](src, s, mask, stream);
-    }
-
-    void cudaCopyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
-    {
-        cudev::copyWithMask(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream);
-    }
-
-    void cudaConvert(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
-    {
-        cudev::convert(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, stream);
-    }
-
-    void cudaConvert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream)
-    {
-        cudev::convert(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
-    }
-}
-
-// NPP implementation
-
-namespace
-{
-    //////////////////////////////////////////////////////////////////////////
-    // Convert
-
-    template <int SDEPTH, int DDEPTH> struct NppConvertFunc
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-
-        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
-    };
-    template <int DDEPTH> struct 
NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - NppStreamHandler h(stream); - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - NppStreamHandler h(stream); - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // Set - - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template<> struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, cudaStream_t stream) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - NppStreamHandler h(stream); - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, cudaStream_t stream) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - NppStreamHandler h(stream); - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - NppStreamHandler h(stream); - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), 
sz, mask.ptr(), static_cast(mask.step)) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - NppStreamHandler h(stream); - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // CopyMasked - - template struct NppCopyWithMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppCopyWithMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - NppStreamHandler h(stream); - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; -} - -// Dispatcher - -namespace -{ - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) - { - CV_DbgAssert( src.size() == dst.size() && src.type() == dst.type() ); - - CV_Assert( src.depth() <= CV_64F && src.channels() <= 4 ); - CV_Assert( src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()) ); - - if (src.depth() == CV_64F) - { - CV_Assert( deviceSupports(NATIVE_DOUBLE) ); - } - - typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); - static const func_t funcs[7][4] = - { - /* 8U */ {NppCopyWithMask::call, cudaCopyWithMask, NppCopyWithMask::call, NppCopyWithMask::call}, - /* 8S */ {cudaCopyWithMask , cudaCopyWithMask, cudaCopyWithMask , cudaCopyWithMask }, - /* 16U */ {NppCopyWithMask::call, cudaCopyWithMask, NppCopyWithMask::call, NppCopyWithMask::call}, - /* 16S */ {NppCopyWithMask::call, cudaCopyWithMask, NppCopyWithMask::call, NppCopyWithMask::call}, - /* 32S */ {NppCopyWithMask::call, cudaCopyWithMask, NppCopyWithMask::call, NppCopyWithMask::call}, - /* 32F */ {NppCopyWithMask::call, cudaCopyWithMask, NppCopyWithMask::call, NppCopyWithMask::call}, - /* 64F */ {cudaCopyWithMask , cudaCopyWithMask, cudaCopyWithMask , cudaCopyWithMask } - }; - - const func_t func = mask.channels() == src.channels() ? 
funcs[src.depth()][src.channels() - 1] : cudaCopyWithMask; - - func(src, dst, mask, stream); - } - - void convert(const GpuMat& src, GpuMat& dst, cudaStream_t stream = 0) - { - CV_DbgAssert( src.size() == dst.size() && src.channels() == dst.channels() ); - - CV_Assert( src.depth() <= CV_64F && src.channels() <= 4 ); - CV_Assert( dst.depth() <= CV_64F ); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - CV_Assert( deviceSupports(NATIVE_DOUBLE) ); - } - - typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[7][7][4] = - { - { - /* 8U -> 8U */ {0, 0, 0, 0}, - /* 8U -> 8S */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert }, - /* 8U -> 16U */ {NppCvt::call, cudaConvert, cudaConvert, NppCvt::call}, - /* 8U -> 16S */ {NppCvt::call, cudaConvert, cudaConvert, NppCvt::call}, - /* 8U -> 32S */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert }, - /* 8U -> 32F */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert }, - /* 8U -> 64F */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert } - }, - { - /* 8S -> 8U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 8S -> 8S */ {0,0,0,0}, - /* 8S -> 16U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 8S -> 16S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 8S -> 32S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 8S -> 32F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 8S -> 64F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert} - }, - { - /* 16U -> 8U */ {NppCvt::call, cudaConvert, cudaConvert, NppCvt::call}, - /* 16U -> 8S */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert }, - /* 16U -> 16U */ {0,0,0,0}, - /* 16U -> 16S */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert }, - /* 16U -> 32S */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert }, - /* 16U -> 32F */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert }, - /* 16U -> 64F */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert } - }, - { - /* 16S -> 8U */ {NppCvt::call, cudaConvert, cudaConvert, NppCvt::call}, - /* 16S -> 8S */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert }, - /* 16S -> 16U */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert }, - /* 16S -> 16S */ {0,0,0,0}, - /* 16S -> 32S */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert }, - /* 16S -> 32F */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert }, - /* 16S -> 64F */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert } - }, - { - /* 32S -> 8U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 32S -> 8S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 32S -> 16U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 32S -> 16S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 32S -> 32S */ {0,0,0,0}, - /* 32S -> 32F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 32S -> 64F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert} - }, - { - /* 32F -> 8U */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert}, - /* 32F -> 8S */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert}, - /* 32F -> 16U */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert}, - /* 32F -> 16S */ {NppCvt::call, cudaConvert, cudaConvert, cudaConvert}, - /* 32F -> 32S */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert}, - /* 32F -> 32F */ {0,0,0,0}, - /* 32F -> 64F */ {cudaConvert , cudaConvert, cudaConvert, cudaConvert} - }, - { - /* 64F -> 8U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 64F -> 
8S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 64F -> 16U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 64F -> 16S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 64F -> 32S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 64F -> 32F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}, - /* 64F -> 64F */ {0,0,0,0} - } - }; - - const bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); - if (!aligned) - { - cudaConvert(src, dst, stream); - return; - } - - const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; - CV_DbgAssert( func != 0 ); - - func(src, dst, stream); - } - - void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) - { - CV_DbgAssert( src.size() == dst.size() && src.channels() == dst.channels() ); - - CV_Assert( src.depth() <= CV_64F && src.channels() <= 4 ); - CV_Assert( dst.depth() <= CV_64F ); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - CV_Assert( deviceSupports(NATIVE_DOUBLE) ); - } - - cudaConvert(src, dst, alpha, beta, stream); - } - - void set(GpuMat& m, Scalar s, cudaStream_t stream = 0) - { - if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) - { - if (stream) - cudaSafeCall( cudaMemset2DAsync(m.data, m.step, 0, m.cols * m.elemSize(), m.rows, stream) ); - else - cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); - return; - } - - if (m.depth() == CV_8U) - { - int cn = m.channels(); - - if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) - { - int val = saturate_cast(s[0]); - if (stream) - cudaSafeCall( cudaMemset2DAsync(m.data, m.step, val, m.cols * m.elemSize(), m.rows, stream) ); - else - cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); - return; - } - } - - typedef void (*func_t)(GpuMat& src, Scalar s, cudaStream_t stream); - static const func_t funcs[7][4] = - { - {NppSet::call, cudaSet , cudaSet , NppSet::call}, - {NppSet::call, NppSet::call, NppSet::call, NppSet::call}, - {NppSet::call, NppSet::call, cudaSet , NppSet::call}, - {NppSet::call, NppSet::call, cudaSet , NppSet::call}, - {NppSet::call, cudaSet , cudaSet , NppSet::call}, - {NppSet::call, cudaSet , cudaSet , NppSet::call}, - {cudaSet , cudaSet , cudaSet , cudaSet } - }; - - CV_Assert( m.depth() <= CV_64F && m.channels() <= 4 ); - - if (m.depth() == CV_64F) - { - CV_Assert( deviceSupports(NATIVE_DOUBLE) ); - } - - funcs[m.depth()][m.channels() - 1](m, s, stream); - } - - void set(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream = 0) - { - CV_DbgAssert( !mask.empty() ); - - CV_Assert( m.depth() <= CV_64F && m.channels() <= 4 ); - - if (m.depth() == CV_64F) - { - CV_Assert( deviceSupports(NATIVE_DOUBLE) ); - } - - typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); - static const func_t funcs[7][4] = - { - {NppSetMask::call, cudaSet, cudaSet, NppSetMask::call}, - {cudaSet , cudaSet, cudaSet, cudaSet }, - {NppSetMask::call, cudaSet, cudaSet, NppSetMask::call}, - {NppSetMask::call, cudaSet, cudaSet, NppSetMask::call}, - {NppSetMask::call, cudaSet, cudaSet, NppSetMask::call}, - {NppSetMask::call, cudaSet, cudaSet, NppSetMask::call}, - {cudaSet , cudaSet, cudaSet, cudaSet } - }; - - funcs[m.depth()][m.channels() - 1](m, s, mask, stream); - } -} - -#endif // HAVE_CUDA - cv::gpu::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t 
step_) : flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(rows_), cols(cols_), step(step_), data((uchar*)data_), refcount(0), @@ -651,288 +153,6 @@ cv::gpu::GpuMat::GpuMat(const GpuMat& m, Rect roi) : rows = cols = 0; } -void cv::gpu::GpuMat::create(int _rows, int _cols, int _type) -{ -#ifndef HAVE_CUDA - (void) _rows; - (void) _cols; - (void) _type; - throw_no_cuda(); -#else - _type &= Mat::TYPE_MASK; - - if (rows == _rows && cols == _cols && type() == _type && data) - return; - - if (data) - release(); - - CV_DbgAssert( _rows >= 0 && _cols >= 0 ); - - if (_rows > 0 && _cols > 0) - { - flags = Mat::MAGIC_VAL + _type; - rows = _rows; - cols = _cols; - - size_t esz = elemSize(); - - void* devPtr; - - if (rows > 1 && cols > 1) - { - cudaSafeCall( cudaMallocPitch(&devPtr, &step, esz * cols, rows) ); - } - else - { - // Single row or single column must be continuous - cudaSafeCall( cudaMalloc(&devPtr, esz * cols * rows) ); - step = esz * cols; - } - - if (esz * cols == step) - flags |= Mat::CONTINUOUS_FLAG; - - int64 _nettosize = static_cast(step) * rows; - size_t nettosize = static_cast(_nettosize); - - datastart = data = static_cast(devPtr); - dataend = data + nettosize; - - refcount = static_cast(fastMalloc(sizeof(*refcount))); - *refcount = 1; - } -#endif -} - -void cv::gpu::GpuMat::release() -{ -#ifdef HAVE_CUDA - if (refcount && CV_XADD(refcount, -1) == 1) - { - cudaFree(datastart); - fastFree(refcount); - } - - data = datastart = dataend = 0; - step = rows = cols = 0; - refcount = 0; -#endif -} - -void cv::gpu::GpuMat::upload(InputArray arr) -{ -#ifndef HAVE_CUDA - (void) arr; - throw_no_cuda(); -#else - Mat mat = arr.getMat(); - - CV_DbgAssert( !mat.empty() ); - - create(mat.size(), mat.type()); - - cudaSafeCall( cudaMemcpy2D(data, step, mat.data, mat.step, cols * elemSize(), rows, cudaMemcpyHostToDevice) ); -#endif -} - -void cv::gpu::GpuMat::upload(InputArray arr, Stream& _stream) -{ -#ifndef HAVE_CUDA - (void) arr; - (void) _stream; - throw_no_cuda(); -#else - Mat mat = arr.getMat(); - - CV_DbgAssert( !mat.empty() ); - - create(mat.size(), mat.type()); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - cudaSafeCall( cudaMemcpy2DAsync(data, step, mat.data, mat.step, cols * elemSize(), rows, cudaMemcpyHostToDevice, stream) ); -#endif -} - -void cv::gpu::GpuMat::download(OutputArray _dst) const -{ -#ifndef HAVE_CUDA - (void) _dst; - throw_no_cuda(); -#else - CV_DbgAssert( !empty() ); - - _dst.create(size(), type()); - Mat dst = _dst.getMat(); - - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToHost) ); -#endif -} - -void cv::gpu::GpuMat::download(OutputArray _dst, Stream& _stream) const -{ -#ifndef HAVE_CUDA - (void) _dst; - (void) _stream; - throw_no_cuda(); -#else - CV_DbgAssert( !empty() ); - - _dst.create(size(), type()); - Mat dst = _dst.getMat(); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToHost, stream) ); -#endif -} - -void cv::gpu::GpuMat::copyTo(OutputArray _dst) const -{ -#ifndef HAVE_CUDA - (void) _dst; - throw_no_cuda(); -#else - CV_DbgAssert( !empty() ); - - _dst.create(size(), type()); - GpuMat dst = _dst.getGpuMat(); - - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToDevice) ); -#endif -} - -void cv::gpu::GpuMat::copyTo(OutputArray _dst, Stream& _stream) const -{ -#ifndef HAVE_CUDA - (void) _dst; - (void) _stream; - 
throw_no_cuda(); -#else - CV_DbgAssert( !empty() ); - - _dst.create(size(), type()); - GpuMat dst = _dst.getGpuMat(); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToDevice, stream) ); -#endif -} - -void cv::gpu::GpuMat::copyTo(OutputArray _dst, InputArray _mask, Stream& _stream) const -{ -#ifndef HAVE_CUDA - (void) _dst; - (void) _mask; - (void) _stream; - throw_no_cuda(); -#else - CV_DbgAssert( !empty() ); - - _dst.create(size(), type()); - GpuMat dst = _dst.getGpuMat(); - - GpuMat mask = _mask.getGpuMat(); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - ::copyWithMask(*this, dst, mask, stream); -#endif -} - -GpuMat& cv::gpu::GpuMat::setTo(Scalar s, Stream& _stream) -{ -#ifndef HAVE_CUDA - (void) s; - (void) _stream; - throw_no_cuda(); -#else - CV_DbgAssert( !empty() ); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - ::set(*this, s, stream); -#endif - - return *this; -} - -GpuMat& cv::gpu::GpuMat::setTo(Scalar s, InputArray _mask, Stream& _stream) -{ -#ifndef HAVE_CUDA - (void) s; - (void) _mask; - (void) _stream; - throw_no_cuda(); -#else - CV_DbgAssert( !empty() ); - - GpuMat mask = _mask.getGpuMat(); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - ::set(*this, s, mask, stream); -#endif - - return *this; -} - -void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& _stream) const -{ -#ifndef HAVE_CUDA - (void) _dst; - (void) rtype; - (void) _stream; - throw_no_cuda(); -#else - if (rtype < 0) - rtype = type(); - else - rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels()); - - const int sdepth = depth(); - const int ddepth = CV_MAT_DEPTH(rtype); - if (sdepth == ddepth) - { - if (_stream) - copyTo(_dst, _stream); - else - copyTo(_dst); - - return; - } - - GpuMat src = *this; - - _dst.create(size(), rtype); - GpuMat dst = _dst.getGpuMat(); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - ::convert(src, dst, stream); -#endif -} - -void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, double alpha, double beta, Stream& _stream) const -{ -#ifndef HAVE_CUDA - (void) _dst; - (void) rtype; - (void) alpha; - (void) beta; - (void) _stream; - throw_no_cuda(); -#else - if (rtype < 0) - rtype = type(); - else - rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels()); - - GpuMat src = *this; - - _dst.create(size(), rtype); - GpuMat dst = _dst.getGpuMat(); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - ::convert(src, dst, alpha, beta, stream); -#endif -} - GpuMat cv::gpu::GpuMat::reshape(int new_cn, int new_rows) const { GpuMat hdr = *this; @@ -1124,3 +344,101 @@ GpuMat cv::gpu::allocMatFromBuf(int rows, int cols, int type, GpuMat& mat) return mat = GpuMat(rows, cols, type); } + +#ifndef HAVE_CUDA + +void cv::gpu::GpuMat::create(int _rows, int _cols, int _type) +{ + (void) _rows; + (void) _cols; + (void) _type; + throw_no_cuda(); +} + +void cv::gpu::GpuMat::release() +{ +} + +void cv::gpu::GpuMat::upload(InputArray arr) +{ + (void) arr; + throw_no_cuda(); +} + +void cv::gpu::GpuMat::upload(InputArray arr, Stream& _stream) +{ + (void) arr; + (void) _stream; + throw_no_cuda(); +} + +void cv::gpu::GpuMat::download(OutputArray _dst) const +{ + (void) _dst; + throw_no_cuda(); +} + +void cv::gpu::GpuMat::download(OutputArray _dst, Stream& _stream) const +{ + (void) _dst; + (void) _stream; + throw_no_cuda(); +} + +void cv::gpu::GpuMat::copyTo(OutputArray _dst) const +{ + (void) _dst; + 
throw_no_cuda();
+}
+
+void cv::gpu::GpuMat::copyTo(OutputArray _dst, Stream& _stream) const
+{
+    (void) _dst;
+    (void) _stream;
+    throw_no_cuda();
+}
+
+void cv::gpu::GpuMat::copyTo(OutputArray _dst, InputArray _mask, Stream& _stream) const
+{
+    (void) _dst;
+    (void) _mask;
+    (void) _stream;
+    throw_no_cuda();
+}
+
+GpuMat& cv::gpu::GpuMat::setTo(Scalar s, Stream& _stream)
+{
+    (void) s;
+    (void) _stream;
+    throw_no_cuda();
+    return *this;
+}
+
+GpuMat& cv::gpu::GpuMat::setTo(Scalar s, InputArray _mask, Stream& _stream)
+{
+    (void) s;
+    (void) _mask;
+    (void) _stream;
+    throw_no_cuda();
+    return *this;
+}
+
+void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& _stream) const
+{
+    (void) _dst;
+    (void) rtype;
+    (void) _stream;
+    throw_no_cuda();
+}
+
+void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, double alpha, double beta, Stream& _stream) const
+{
+    (void) _dst;
+    (void) rtype;
+    (void) alpha;
+    (void) beta;
+    (void) _stream;
+    throw_no_cuda();
+}
+
+#endif
-- 
2.7.4
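
Usage sketch (illustrative only, not part of the patch). Assumes an OpenCV
build with CUDA support and the 2.4/3.0-era cv::gpu API declared in
"opencv2/core/gpu.hpp"; sizes and values are arbitrary placeholders.

    #include "opencv2/core/gpu.hpp"

    using namespace cv;

    int main()
    {
        Mat host(480, 640, CV_8UC1, Scalar(0));

        gpu::GpuMat dev;
        dev.upload(host);                // H2D transfer, cudaMemcpy2D under the hood

        dev.setTo(Scalar(128));          // uniform CV_8U value -> cudaMemset2D fast path

        gpu::GpuMat devf;
        dev.convertTo(devf, CV_32F, 1.0 / 255.0, 0.0);   // fused saturate_cast(alpha*x + beta)

        Mat result;
        devf.download(result);           // D2H transfer; result becomes CV_32FC1

        return 0;
    }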