From 2d30480982a01df3619781c6ef80148b298c68cd Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 22 Feb 2012 10:00:53 +0000 Subject: [PATCH] created wrappers for new NPP functions removed void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream) - it fails with NPP_NOT_IMPLEMENTED error updated docs, accuracy and performance tests --- modules/core/src/gpumat.cpp | 193 +++- modules/gpu/doc/image_processing.rst | 56 +- modules/gpu/doc/matrix_reductions.rst | 3 + modules/gpu/doc/operations_on_matrices.rst | 2 +- modules/gpu/doc/per_element_operations.rst | 119 +- modules/gpu/include/opencv2/gpu/gpu.hpp | 77 +- modules/gpu/perf/perf_arithm.cpp | 32 +- modules/gpu/perf/perf_imgproc.cpp | 90 +- modules/gpu/perf/perf_main.cpp | 2 +- modules/gpu/perf/perf_utility.hpp | 1 + modules/gpu/src/arithm.cpp | 127 +- modules/gpu/src/color.cpp | 16 + modules/gpu/src/cuda/bilateral_filter.cu | 2 +- modules/gpu/src/cuda/element_operations.cu | 4 +- modules/gpu/src/element_operations.cpp | 1213 ++++++++++++++++---- modules/gpu/src/filtering.cpp | 2 +- modules/gpu/src/graphcuts.cpp | 101 +- modules/gpu/src/imgproc.cpp | 161 ++- modules/gpu/src/initialization.cpp | 13 +- modules/gpu/src/matrix_reductions.cpp | 17 +- modules/gpu/src/optical_flow.cpp | 2 +- modules/gpu/src/optical_flow_farneback.cpp | 2 +- modules/gpu/src/orb.cpp | 6 +- modules/gpu/src/precomp.hpp | 4 +- modules/gpu/test/test_arithm.cpp | 185 ++- modules/gpu/test/test_imgproc.cpp | 43 + modules/gpu/test/test_main.cpp | 2 +- modules/highgui/src/window.cpp | 1 - .../opencv2/stitching/detail/seam_finders.hpp | 2 + modules/stitching/src/seam_finders.cpp | 2 + samples/cpp/point_cloud.cpp | 2 +- samples/gpu/alpha_comp.cpp | 68 ++ samples/gpu/brox_optical_flow.cpp | 2 +- samples/gpu/farneback_optical_flow.cpp | 6 +- samples/gpu/performance/performance.cpp | 2 +- samples/gpu/performance/performance.h | 6 +- samples/gpu/performance/tests.cpp | 2 +- 37 files changed, 1993 insertions(+), 575 deletions(-) create mode 100644 samples/gpu/alpha_comp.cpp diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 092c92f..8e69c18 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -48,6 +48,17 @@ #ifdef HAVE_CUDA #include #include + + #define CUDART_MINIMUM_REQUIRED_VERSION 4010 + #define NPP_MINIMUM_REQUIRED_VERSION 4100 + + #if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION) + #error "Insufficient Cuda Runtime library version, please update it." + #endif + + #if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION) + #error "Insufficient NPP version, please update it." 
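The compile-time guards above reject builds against a CUDA runtime older than 4.1 or an NPP build older than 4.1.0. If a run-time confirmation of the linked NPP library is also wanted, a minimal sketch could look like the following (nppGetLibVersion() and the NppLibraryVersion field names are assumptions taken from the NPP headers, not from this patch)::

    #include <npp.h>
    #include <cstdio>

    // Print the NPP version the application is actually linked against.
    void printNppVersion()
    {
        const NppLibraryVersion* ver = nppGetLibVersion();
        std::printf("NPP %d.%d.%d\n", ver->major, ver->minor, ver->build);
    }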
+ #endif #endif using namespace std; @@ -460,15 +471,17 @@ namespace cv { namespace gpu namespace { - ////////////////////////////////////////////////////////////////////////// - // Convert - template struct NPPTypeTraits; template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; + template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; + template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; + + ////////////////////////////////////////////////////////////////////////// + // Convert template struct NppConvertFunc { @@ -494,6 +507,7 @@ namespace NppiSize sz; sz.width = src.cols; sz.height = src.rows; + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); cudaSafeCall( cudaDeviceSynchronize() ); @@ -508,6 +522,7 @@ namespace NppiSize sz; sz.width = src.cols; sz.height = src.rows; + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); cudaSafeCall( cudaDeviceSynchronize() ); @@ -529,6 +544,14 @@ namespace typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); }; + template struct NppSetFunc + { + typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + template<> struct NppSetFunc + { + typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); + }; template::func_ptr func> struct NppSet { @@ -613,6 +636,35 @@ namespace } }; + ////////////////////////////////////////////////////////////////////////// + // CopyMasked + + template struct NppCopyMaskedFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); + }; + + template::func_ptr func> struct NppCopyMasked + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void copyMasked(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + ////////////////////////////////////////////////////////////////////////// + // CudaFuncTable + class CudaFuncTable : public GpuFuncTable { public: @@ -631,7 +683,26 @@ namespace void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const { - ::cv::gpu::copyWithMask(src, dst, mask); + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); + + static const caller_t callers[7][4] = + { + /* 8U */ {NppCopyMasked::copyMasked, cv::gpu::copyWithMask, NppCopyMasked::copyMasked, NppCopyMasked::copyMasked}, + /* 8S */ {cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask}, + /* 16U */ {NppCopyMasked::copyMasked, cv::gpu::copyWithMask, NppCopyMasked::copyMasked, NppCopyMasked::copyMasked}, + /* 16S */ {NppCopyMasked::copyMasked, cv::gpu::copyWithMask, 
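The callers table above routes masked copies to NPP's masked-copy primitives per depth/channel combination and falls back to the generic CUDA kernel (cv::gpu::copyWithMask) elsewhere. From the user side this is reached through the masked GpuMat::copyTo overload; a minimal usage sketch, with buffer names of my own choosing::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    void maskedCopyExample(const cv::Mat& image, const cv::Mat& mask8u)  // mask: CV_8UC1
    {
        cv::gpu::GpuMat d_src(image);                                    // upload source
        cv::gpu::GpuMat d_mask(mask8u);                                  // non-zero = copy
        cv::gpu::GpuMat d_dst(image.size(), image.type(), cv::Scalar::all(0));

        d_src.copyTo(d_dst, d_mask);                                     // dispatched via the table above

        cv::Mat result;
        d_dst.download(result);                                          // bring the result back to the host
    }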
NppCopyMasked::copyMasked, NppCopyMasked::copyMasked}, + /* 32S */ {NppCopyMasked::copyMasked, cv::gpu::copyWithMask, NppCopyMasked::copyMasked, NppCopyMasked::copyMasked}, + /* 32F */ {NppCopyMasked::copyMasked, cv::gpu::copyWithMask, NppCopyMasked::copyMasked, NppCopyMasked::copyMasked}, + /* 64F */ {cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask} + }; + + caller_t func = mask.channels() == src.channels() ? callers[src.depth()][src.channels()] : cv::gpu::copyWithMask; + CV_DbgAssert(func != 0); + + func(src, dst, mask, 0); } void convert(const GpuMat& src, GpuMat& dst) const @@ -641,65 +712,65 @@ namespace { { /* 8U -> 8U */ {0, 0, 0, 0}, - /* 8U -> 8S */ {::cv::gpu::convertTo, ::cv::gpu::convertTo, ::cv::gpu::convertTo, ::cv::gpu::convertTo}, - /* 8U -> 16U */ {NppCvt::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt::cvt}, - /* 8U -> 16S */ {NppCvt::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt::cvt}, - /* 8U -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 8U -> 32F */ {NppCvt::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 8U -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo} + /* 8U -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, + /* 8U -> 16U */ {NppCvt::cvt,cv::gpu::convertTo,cv::gpu::convertTo,NppCvt::cvt}, + /* 8U -> 16S */ {NppCvt::cvt,cv::gpu::convertTo,cv::gpu::convertTo,NppCvt::cvt}, + /* 8U -> 32S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 8U -> 32F */ {NppCvt::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 8U -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo} }, { - /* 8S -> 8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, + /* 8S -> 8U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, /* 8S -> 8S */ {0,0,0,0}, - /* 8S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 8S -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 8S -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 8S -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 8S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo} + /* 8S -> 16U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 8S -> 16S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 8S -> 32S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 8S -> 32F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 8S -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo} }, { - /* 16U -> 8U */ {NppCvt::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt::cvt}, - /* 16U -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, + /* 16U -> 8U */ {NppCvt::cvt,cv::gpu::convertTo,cv::gpu::convertTo,NppCvt::cvt}, + /* 16U -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, /* 16U -> 16U */ {0,0,0,0}, - /* 16U -> 16S */ 
{::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 16U -> 32S */ {NppCvt::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 16U -> 32F */ {NppCvt::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 16U -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo} + /* 16U -> 16S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 16U -> 32S */ {NppCvt::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 16U -> 32F */ {NppCvt::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 16U -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo} }, { - /* 16S -> 8U */ {NppCvt::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt::cvt}, - /* 16S -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 16S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, + /* 16S -> 8U */ {NppCvt::cvt,cv::gpu::convertTo,cv::gpu::convertTo,NppCvt::cvt}, + /* 16S -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 16S -> 16U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, /* 16S -> 16S */ {0,0,0,0}, - /* 16S -> 32S */ {NppCvt::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 16S -> 32F */ {NppCvt::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 16S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo} + /* 16S -> 32S */ {NppCvt::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 16S -> 32F */ {NppCvt::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 16S -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo} }, { - /* 32S -> 8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 32S -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 32S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 32S -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, + /* 32S -> 8U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 32S -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 32S -> 16U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 32S -> 16S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, /* 32S -> 32S */ {0,0,0,0}, - /* 32S -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 32S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo} + /* 32S -> 32F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 32S -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo} }, { - /* 32F -> 8U */ {NppCvt::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 32F -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 32F -> 16U */ {NppCvt::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 32F -> 16S */ 
{NppCvt::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 32F -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, + /* 32F -> 8U */ {NppCvt::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 32F -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 32F -> 16U */ {NppCvt::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 32F -> 16S */ {NppCvt::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 32F -> 32S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, /* 32F -> 32F */ {0,0,0,0}, - /* 32F -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo} + /* 32F -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo} }, { - /* 64F -> 8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 64F -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 64F -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 64F -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 64F -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, - /* 64F -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}, + /* 64F -> 8U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 64F -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 64F -> 16U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 64F -> 16S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 64F -> 32S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, + /* 64F -> 32F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}, /* 64F -> 64F */ {0,0,0,0} } }; @@ -712,7 +783,7 @@ namespace void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const { - ::cv::gpu::convertTo(src, dst, alpha, beta); + cv::gpu::convertTo(src, dst, alpha, beta); } void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const @@ -744,13 +815,13 @@ namespace typedef void (*caller_t)(GpuMat& src, Scalar s); static const caller_t callers[7][4] = { - {NppSet::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet::set}, - {::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo}, - {NppSet::set, NppSet::set, ::cv::gpu::setTo, NppSet::set}, - {NppSet::set, NppSet::set, ::cv::gpu::setTo, NppSet::set}, - {NppSet::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet::set}, - {NppSet::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet::set}, - {::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo} + {NppSet::set, cv::gpu::setTo, cv::gpu::setTo, NppSet::set}, + {NppSet::set, NppSet::set, NppSet::set, NppSet::set}, + {NppSet::set, NppSet::set, cv::gpu::setTo, NppSet::set}, + {NppSet::set, NppSet::set, cv::gpu::setTo, NppSet::set}, + {NppSet::set, cv::gpu::setTo, cv::gpu::setTo, NppSet::set}, + {NppSet::set, cv::gpu::setTo, cv::gpu::setTo, NppSet::set}, + {cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo} }; callers[m.depth()][m.channels() - 1](m, s); @@ -761,13 +832,13 @@ namespace static const caller_t callers[7][4] = { - 
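The convert and set tables in this file back GpuMat::convertTo and GpuMat::setTo: the NPP specializations (NppCvt, NppSet, NppSetMask) are used where a matching primitive exists, and the generic CUDA kernels handle the remaining type combinations. A minimal usage sketch of the public entry points::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    void convertAndSetExample(const cv::Mat& src8u)            // CV_8UC1 input
    {
        cv::gpu::GpuMat d_src(src8u);
        cv::gpu::GpuMat d_dst;

        d_src.convertTo(d_dst, CV_32F, 1.0 / 255.0);           // 8U -> 32F, scaled

        cv::gpu::GpuMat d_buf(d_src.size(), CV_8UC1);
        d_buf.setTo(cv::Scalar::all(128));                     // constant fill
    }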
{NppSetMask::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask::set}, - {::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo}, - {NppSetMask::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask::set}, - {NppSetMask::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask::set}, - {NppSetMask::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask::set}, - {NppSetMask::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask::set}, - {::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo} + {NppSetMask::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::set}, + {cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo}, + {NppSetMask::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::set}, + {NppSetMask::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::set}, + {NppSetMask::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::set}, + {NppSetMask::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::set}, + {cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo} }; callers[m.depth()][m.channels() - 1](m, s, mask); diff --git a/modules/gpu/doc/image_processing.rst b/modules/gpu/doc/image_processing.rst index 2d06948..f7b4083 100644 --- a/modules/gpu/doc/image_processing.rst +++ b/modules/gpu/doc/image_processing.rst @@ -69,18 +69,14 @@ Performs a mean-shift segmentation of the source image and eliminates small segm gpu::integral ----------------- -Computes an integral image and a squared integral image. +Computes an integral image. .. ocv:function:: void gpu::integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null()) -.. ocv:function:: void gpu::integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream = Stream::Null()) - :param src: Source image. Only ``CV_8UC1`` images are supported for now. :param sum: Integral image containing 32-bit unsigned integer values packed into ``CV_32SC1`` . - :param sqsum: Squared integral image of the ``CV_32FC1`` type. - :param stream: Stream for the asynchronous version. .. seealso:: :ocv:func:`integral` @@ -380,6 +376,22 @@ Converts an image from one color space to another. +gpu::swapChannels +----------------- +Exchanges the color channels of an image in-place. + +.. ocv:function:: void gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& stream = Stream::Null()) + + :param src: Source image. Supports only ``CV_8UC4`` type. + + :param dstOrder: Integer array describing how channel values are permutated. The n-th entry of the array contains the number of the channel that is stored in the n-th channel of the output image. E.g. Given an RGBA image, aDstOrder = [3,2,1,0] converts this to ABGR channel order. + + :param stream: Stream for the asynchronous version. + +The methods support arbitrary permutations of the original channels, including replication. + + + gpu::threshold ------------------ Applies a fixed-level threshold to each array element. @@ -489,7 +501,7 @@ Rotates an image around the origin (0,0) and then shifts it. .. ocv:function:: void gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()) - :param src: Source image. ``CV_8UC1`` and ``CV_8UC4`` types are supported. + :param src: Source image. Supports 1, 3 or 4 channels images with ``CV_8U`` , ``CV_16U`` or ``CV_32F`` depth. :param dst: Destination image with the same type as ``src`` . The size is ``dsize`` . @@ -751,6 +763,38 @@ Performs linear blending of two images. 
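A minimal usage sketch for gpu::swapChannels as documented above; the order {2, 1, 0, 3} turns a BGRA image into RGBA in place (image and variable names are illustrative)::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    void swapChannelsExample(const cv::Mat& bgra)      // CV_8UC4 input
    {
        cv::gpu::GpuMat d_image(bgra);

        const int dstOrder[4] = {2, 1, 0, 3};          // BGRA -> RGBA, in place
        cv::gpu::swapChannels(d_image, dstOrder);

        cv::Mat rgba;
        d_image.download(rgba);
    }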
+gpu::alphaComp +------------------- +Composites two images using alpha opacity values contained in each image. + +.. ocv:function:: void gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream = Stream::Null()) + + :param img1: First image. Supports ``CV_8UC4`` , ``CV_16UC4`` , ``CV_32SC4`` and ``CV_32FC4`` types. + + :param img1: Second image. Must have the same size and the same type as ``img1`` . + + :param dst: Destination image. + + :param alpha_op: Flag specifying the alpha-blending operation: + + * **ALPHA_OVER** + * **ALPHA_IN** + * **ALPHA_OUT** + * **ALPHA_ATOP** + * **ALPHA_XOR** + * **ALPHA_PLUS** + * **ALPHA_OVER_PREMUL** + * **ALPHA_IN_PREMUL** + * **ALPHA_OUT_PREMUL** + * **ALPHA_ATOP_PREMUL** + * **ALPHA_XOR_PREMUL** + * **ALPHA_PLUS_PREMUL** + * **ALPHA_PREMUL** + + :param stream: Stream for the asynchronous version. + + + gpu::Canny ------------------- Finds edges in an image using the [Canny86]_ algorithm. diff --git a/modules/gpu/doc/matrix_reductions.rst b/modules/gpu/doc/matrix_reductions.rst index 965dce4..26e420c 100644 --- a/modules/gpu/doc/matrix_reductions.rst +++ b/modules/gpu/doc/matrix_reductions.rst @@ -10,6 +10,7 @@ gpu::meanStdDev Computes a mean value and a standard deviation of matrix elements. .. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev) +.. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf); :param mtx: Source matrix. ``CV_8UC1`` matrices are supported for now. @@ -17,6 +18,8 @@ Computes a mean value and a standard deviation of matrix elements. :param stddev: Standard deviation value. + :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically. + .. seealso:: :ocv:func:`meanStdDev` diff --git a/modules/gpu/doc/operations_on_matrices.rst b/modules/gpu/doc/operations_on_matrices.rst index 3d91ab7..68d9c06 100644 --- a/modules/gpu/doc/operations_on_matrices.rst +++ b/modules/gpu/doc/operations_on_matrices.rst @@ -63,7 +63,7 @@ Flips a 2D matrix around vertical, horizontal, or both axes. .. ocv:function:: void gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream = Stream::Null()) - :param src: Source matrix. Only ``CV_8UC1`` and ``CV_8UC4`` matrices are supported for now. + :param src: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U``, ``CV_16U``, ``CV_32S`` or ``CV_32F`` depth. :param dst: Destination matrix. diff --git a/modules/gpu/doc/per_element_operations.rst b/modules/gpu/doc/per_element_operations.rst index 527b97d..a09b7a3 100644 --- a/modules/gpu/doc/per_element_operations.rst +++ b/modules/gpu/doc/per_element_operations.rst @@ -139,45 +139,65 @@ where ``I`` is a multi-dimensional index of array elements. In case of multi-cha - - -gpu::exp +gpu::abs ------------ -Computes an exponent of each matrix element. +Computes an absolute value of each matrix element. -.. ocv:function:: void gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) +.. ocv:function:: void gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) - :param src: Source matrix. ``CV_32FC1`` matrixes are supported for now. + :param src: Source matrix. Supports ``CV_16S`` and ``CV_32F`` depth. :param dst: Destination matrix with the same size and type as ``src`` . :param stream: Stream for the asynchronous version. -.. seealso:: :ocv:func:`exp` +.. seealso:: :ocv:func:`abs` -gpu::pow +gpu::sqr ------------ -Raises every matrix element to a power. 
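A minimal usage sketch for gpu::alphaComp as documented above; both inputs are 4-channel images that carry their own alpha (here CV_8UC4), and ALPHA_OVER composites the first image over the second::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    void alphaCompExample(const cv::Mat& fg_bgra, const cv::Mat& bg_bgra)  // both CV_8UC4
    {
        cv::gpu::GpuMat d_fg(fg_bgra);
        cv::gpu::GpuMat d_bg(bg_bgra);
        cv::gpu::GpuMat d_dst;

        cv::gpu::alphaComp(d_fg, d_bg, d_dst, cv::gpu::ALPHA_OVER);

        cv::Mat composited;
        d_dst.download(composited);
    }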
+Computes a square value of each matrix element. -.. ocv:function:: void gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null()) +.. ocv:function:: void gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) + + :param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth. + + :param dst: Destination matrix with the same size and type as ``src`` . + + :param stream: Stream for the asynchronous version. - :param src: Source matrix. Supports all type, except ``CV_64F`` depth. - :param power: Exponent of power. + +gpu::sqrt +------------ +Computes a square root of each matrix element. + +.. ocv:function:: void gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) + + :param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth. :param dst: Destination matrix with the same size and type as ``src`` . :param stream: Stream for the asynchronous version. -The function ``pow`` raises every element of the input matrix to ``p`` : +.. seealso:: :ocv:func:`sqrt` -.. math:: - \texttt{dst} (I) = \fork{\texttt{src}(I)^p}{if \texttt{p} is integer}{|\texttt{src}(I)|^p}{otherwise} -.. seealso:: :ocv:func:`pow` +gpu::exp +------------ +Computes an exponent of each matrix element. + +.. ocv:function:: void gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) + + :param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth. + + :param dst: Destination matrix with the same size and type as ``src`` . + + :param stream: Stream for the asynchronous version. + +.. seealso:: :ocv:func:`exp` @@ -187,7 +207,7 @@ Computes a natural logarithm of absolute value of each matrix element. .. ocv:function:: void gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) - :param src: Source matrix. ``CV_32FC1`` matrixes are supported for now. + :param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth. :param dst: Destination matrix with the same size and type as ``src`` . @@ -197,6 +217,30 @@ Computes a natural logarithm of absolute value of each matrix element. +gpu::pow +------------ +Raises every matrix element to a power. + +.. ocv:function:: void gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null()) + + :param src: Source matrix. Supports all type, except ``CV_64F`` depth. + + :param power: Exponent of power. + + :param dst: Destination matrix with the same size and type as ``src`` . + + :param stream: Stream for the asynchronous version. + +The function ``pow`` raises every element of the input matrix to ``p`` : + +.. math:: + + \texttt{dst} (I) = \fork{\texttt{src}(I)^p}{if \texttt{p} is integer}{|\texttt{src}(I)|^p}{otherwise} + +.. seealso:: :ocv:func:`pow` + + + gpu::absdiff ---------------- Computes per-element absolute difference of two matrices (or of a matrix and scalar). @@ -262,9 +306,10 @@ Performs a per-element bitwise inversion. gpu::bitwise_or ------------------- -Performs a per-element bitwise disjunction of two matrices. +Performs a per-element bitwise disjunction of two matrices or of matrix and scalar. .. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()) +.. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null()) :param src1: First source matrix. 
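A minimal sketch exercising the per-element math functions documented above; abs accepts CV_16S and CV_32F depth, while sqr, sqrt, exp and log accept CV_8U, CV_16U, CV_16S and CV_32F::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    void elementWiseMathExample(const cv::Mat& src16s, const cv::Mat& src32f)
    {
        cv::gpu::GpuMat d_16s(src16s);       // CV_16SC1
        cv::gpu::GpuMat d_32f(src32f);       // CV_32FC1
        cv::gpu::GpuMat d_abs, d_sqr, d_sqrt, d_exp, d_log, d_pow;

        cv::gpu::abs (d_16s, d_abs);         // |x|
        cv::gpu::sqr (d_16s, d_sqr);         // x * x
        cv::gpu::sqrt(d_32f, d_sqrt);
        cv::gpu::exp (d_32f, d_exp);
        cv::gpu::log (d_32f, d_log);         // log of the absolute value
        cv::gpu::pow (d_32f, 2.5, d_pow);    // any depth except CV_64F
    }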
@@ -280,9 +325,10 @@ Performs a per-element bitwise disjunction of two matrices. gpu::bitwise_and -------------------- -Performs a per-element bitwise conjunction of two matrices. +Performs a per-element bitwise conjunction of two matrices or of matrix and scalar. .. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()) +.. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null()) :param src1: First source matrix. @@ -298,9 +344,10 @@ Performs a per-element bitwise conjunction of two matrices. gpu::bitwise_xor -------------------- -Performs a per-element bitwise ``exclusive or`` operation of two matrices. +Performs a per-element bitwise ``exclusive or`` operation of two matrices of matrix and scalar. .. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()) +.. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null()) :param src1: First source matrix. @@ -314,6 +361,38 @@ Performs a per-element bitwise ``exclusive or`` operation of two matrices. +gpu::rshift +-------------------- +Performs pixel by pixel right shift of an image by a constant value. + +.. ocv:function:: void gpu::rshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null()) + + :param src: Source matrix. Supports 1, 3 and 4 channels images with integers elements. + + :param sc: Constant values, one per channel. + + :param dst: Destination matrix with the same size and type as ``src`` . + + :param stream: Stream for the asynchronous version. + + + +gpu::lshift +-------------------- +Performs pixel by pixel right left of an image by a constant value. + +.. ocv:function:: void gpu::lshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null()) + + :param src: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U`` , ``CV_16U`` or ``CV_32S`` depth. + + :param sc: Constant values, one per channel. + + :param dst: Destination matrix with the same size and type as ``src`` . + + :param stream: Stream for the asynchronous version. + + + gpu::min ------------ Computes the per-element minimum of two matrices (or a matrix and a scalar). diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index f65c087..b389d6f 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -498,7 +498,7 @@ CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha, CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null()); //! reverses the order of the rows, columns or both in a matrix -//! supports CV_8UC1, CV_8UC4 types +//! supports 1, 3 and 4 channels images with CV_8U, CV_16U, CV_32S or CV_32F depth CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null()); //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i)) @@ -586,20 +586,32 @@ CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& str //! computes element-wise absolute difference of array and scalar (c = abs(a - s)) CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null()); +//! computes absolute value of each matrix element +//! 
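A minimal sketch for the scalar bitwise operations and the per-pixel shifts documented above; all of them work on integer matrices with 1, 3 or 4 channels and take one scalar component per channel::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    void bitwiseAndShiftExample(const cv::Mat& src8uc3)      // CV_8UC3 input
    {
        cv::gpu::GpuMat d_src(src8uc3);
        cv::gpu::GpuMat d_and, d_or, d_xor, d_rsh, d_lsh;

        cv::Scalar bits(0x0F, 0x0F, 0x0F);

        cv::gpu::bitwise_and(d_src, bits, d_and);
        cv::gpu::bitwise_or (d_src, bits, d_or);
        cv::gpu::bitwise_xor(d_src, bits, d_xor);

        cv::gpu::rshift(d_src, cv::Scalar(2, 2, 2), d_rsh);   // per-channel shift amounts
        cv::gpu::lshift(d_src, cv::Scalar(1, 1, 1), d_lsh);
    }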
supports CV_16S and CV_32F depth +CV_EXPORTS void abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); + +//! computes square of each pixel in an image +//! supports CV_8U, CV_16U, CV_16S and CV_32F depth +CV_EXPORTS void sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); + +//! computes square root of each pixel in an image +//! supports CV_8U, CV_16U, CV_16S and CV_32F depth +CV_EXPORTS void sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); + //! computes exponent of each matrix element (b = e**a) -//! supports only CV_32FC1 type +//! supports CV_8U, CV_16U, CV_16S and CV_32F depth CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null()); +//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a)) +//! supports CV_8U, CV_16U, CV_16S and CV_32F depth +CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null()); + //! computes power of each matrix element: // (dst(i,j) = pow( src(i,j) , power), if src.type() is integer // (dst(i,j) = pow(fabs(src(i,j)), power), otherwise //! supports all, except depth == CV_64F CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null()); -//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a)) -//! supports only CV_32FC1 type -CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null()); - //! compares elements of two arrays (c = a b) CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null()); @@ -608,12 +620,29 @@ CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=G //! calculates per-element bit-wise disjunction of two arrays CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); +//! calculates per-element bit-wise disjunction of array and scalar +//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth +CV_EXPORTS void bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null()); //! calculates per-element bit-wise conjunction of two arrays CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); +//! calculates per-element bit-wise conjunction of array and scalar +//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth +CV_EXPORTS void bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null()); //! calculates per-element bit-wise "exclusive or" operation CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); +//! calculates per-element bit-wise "exclusive or" of array and scalar +//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth +CV_EXPORTS void bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null()); + +//! pixel by pixel right shift of an image by a constant value +//! supports 1, 3 and 4 channels images with integers elements +CV_EXPORTS void rshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null()); + +//! pixel by pixel left shift of an image by a constant value +//! 
supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth +CV_EXPORTS void lshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null()); //! computes per-element minimum of two arrays (dst = min(src1, src2)) CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null()); @@ -627,6 +656,13 @@ CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& //! computes per-element maximum of array and scalar (dst = max(src1, src2)) CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null()); +enum { ALPHA_OVER, ALPHA_IN, ALPHA_OUT, ALPHA_ATOP, ALPHA_XOR, ALPHA_PLUS, ALPHA_OVER_PREMUL, ALPHA_IN_PREMUL, ALPHA_OUT_PREMUL, + ALPHA_ATOP_PREMUL, ALPHA_XOR_PREMUL, ALPHA_PLUS_PREMUL, ALPHA_PREMUL}; + +//! Composite two images using alpha opacity values contained in each image +//! Supports CV_8UC4, CV_16UC4, CV_32SC4 and CV_32FC4 types +CV_EXPORTS void alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream = Stream::Null()); + ////////////////////////////// Image processing ////////////////////////////// @@ -665,6 +701,13 @@ CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& //! converts image from one color space to another CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null()); +//! swap channels +//! dstOrder - Integer array describing how channel values are permutated. The n-th entry +//! of the array contains the number of the channel that is stored in the n-th channel of +//! the output image. E.g. Given an RGBA image, aDstOrder = [3,2,1,0] converts this to ABGR +//! channel order. +CV_EXPORTS void swapChannels(GpuMat& image, const int dstOrder[4], Stream& stream = Stream::Null()); + //! applies fixed threshold to the image CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null()); @@ -692,9 +735,9 @@ CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null()); -//! rotate 8bit single or four channel image -//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC -//! supports CV_8UC1, CV_8UC4 types +//! rotates an image around the origin (0,0) and then shifts it +//! supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC +//! supports 1, 3 or 4 channels images with CV_8U, CV_16U or CV_32F depth CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()); @@ -706,15 +749,9 @@ CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bott //! sum will have CV_32S type, but will contain unsigned int values //! supports only CV_8UC1 source type CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null()); - //! buffered version CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null()); -//! computes the integral image and integral for the squared image -//! sum will have CV_32S type, sqsum - CV32F type -//! supports only CV_8UC1 source type -CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream = Stream::Null()); - //! 
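This patch drops the integral overload that produced sum and sqsum together (it fails with NPP_NOT_IMPLEMENTED, per the commit message). A minimal sketch of the remaining entry points; cv::gpu::sqrIntegral is assumed to be the squared-integral function declared just below::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    void integralExample(const cv::Mat& src8u)               // CV_8UC1 input
    {
        cv::gpu::GpuMat d_src(src8u);
        cv::gpu::GpuMat d_sum, d_buf, d_sqsum;

        cv::gpu::integral(d_src, d_sum);                     // CV_32S sum image
        cv::gpu::integralBuffered(d_src, d_sum, d_buf);      // reuses d_buf across calls

        cv::gpu::sqrIntegral(d_src, d_sqsum);                // squared integral image
    }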
computes squared integral image //! result matrix will have 64F type, but will contain 64U values //! supports source images of 8UC1 type only @@ -859,6 +896,8 @@ private: //! computes mean value and standard deviation of all or selected array elements //! supports only CV_8UC1 type CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev); +//! buffered version +CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf); //! computes norm of array //! supports NORM_INF, NORM_L1, NORM_L2 @@ -939,10 +978,16 @@ CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& c //////////////////////////////// Image Labeling //////////////////////////////// -//!performs labeling via graph cuts +//!performs labeling via graph cuts of a 2D regular 4-connected graph. CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& stream = Stream::Null()); +//!performs labeling via graph cuts of a 2D regular 8-connected graph. +CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight, + GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, + GpuMat& labels, + GpuMat& buf, Stream& stream = Stream::Null()); + ////////////////////////////////// Histograms ////////////////////////////////// //! Compute levels with even distribution. levels will have 1 row and nLevels cols and CV_32SC1 type. diff --git a/modules/gpu/perf/perf_arithm.cpp b/modules/gpu/perf/perf_arithm.cpp index d9a2d02..2ab720c 100644 --- a/modules/gpu/perf/perf_arithm.cpp +++ b/modules/gpu/perf/perf_arithm.cpp @@ -59,7 +59,7 @@ GPU_PERF_TEST(Flip, cv::gpu::DeviceInfo, cv::Size, perf::MatType, FlipCode) INSTANTIATE_TEST_CASE_P(Arithm, Flip, testing::Combine( ALL_DEVICES, GPU_TYPICAL_MAT_SIZES, - testing::Values(CV_8UC1, CV_8UC4), + testing::Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), testing::Values((int) HORIZONTAL_AXIS, (int) VERTICAL_AXIS, (int) BOTH_AXIS))); ////////////////////////////////////////////////////////////////////// @@ -363,6 +363,33 @@ INSTANTIATE_TEST_CASE_P(Arithm, BitwiseAnd, testing::Combine( GPU_TYPICAL_MAT_SIZES, testing::Values(CV_8UC1, CV_16UC1, CV_32SC1))); +GPU_PERF_TEST(BitwiseScalarAnd, cv::gpu::DeviceInfo, cv::Size, perf::MatType) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::Size size = GET_PARAM(1); + int type = GET_PARAM(2); + + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Mat src_host(size, type); + + declare.in(src_host, WARMUP_RNG); + + cv::gpu::GpuMat src(src_host); + cv::gpu::GpuMat dst; + cv::Scalar sc = cv::Scalar(123, 123, 123, 123); + + TEST_CYCLE() + { + cv::gpu::bitwise_and(src, sc, dst); + } +} + +INSTANTIATE_TEST_CASE_P(Arithm, BitwiseScalarAnd, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32SC1, CV_32SC3, CV_32SC4))); + ////////////////////////////////////////////////////////////////////// // Min @@ -411,10 +438,11 @@ GPU_PERF_TEST(MeanStdDev, cv::gpu::DeviceInfo, cv::Size) cv::gpu::GpuMat src(src_host); cv::Scalar mean; cv::Scalar stddev; + cv::gpu::GpuMat buf; TEST_CYCLE() { - cv::gpu::meanStdDev(src, mean, stddev); + cv::gpu::meanStdDev(src, mean, stddev, buf); } } diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index e2138a9..1418d09 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ 
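A minimal sketch for the new buffered gpu::meanStdDev overload declared above; keeping the buffer alive across calls avoids a device allocation per frame (the loop and variable names are illustrative)::

    #include <vector>
    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    void meanStdDevLoopExample(const std::vector<cv::Mat>& frames)    // CV_8UC1 frames
    {
        cv::gpu::GpuMat d_frame, d_buf;                // d_buf is resized once, then reused
        cv::Scalar mean, stddev;

        for (size_t i = 0; i < frames.size(); ++i)
        {
            d_frame.upload(frames[i]);
            cv::gpu::meanStdDev(d_frame, mean, stddev, d_buf);        // buffered overload
        }
    }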
-201,7 +201,7 @@ GPU_PERF_TEST(CvtColor, cv::gpu::DeviceInfo, cv::Size, perf::MatType, CvtColorIn declare.in(src_host, WARMUP_RNG); cv::gpu::GpuMat src(src_host); - cv::gpu::GpuMat dst(size, CV_MAKETYPE(type, info.dcn)); + cv::gpu::GpuMat dst; TEST_CYCLE() { @@ -219,6 +219,32 @@ INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, testing::Combine( CvtColorInfo(4, 4, cv::COLOR_BGR2HSV), CvtColorInfo(4, 4, cv::COLOR_HSV2BGR)))); ////////////////////////////////////////////////////////////////////// +// SwapChannels + +GPU_PERF_TEST(SwapChannels, cv::gpu::DeviceInfo, cv::Size) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::Size size = GET_PARAM(1); + + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Mat src_host(size, CV_8UC4); + + declare.in(src_host, WARMUP_RNG); + + cv::gpu::GpuMat src(src_host); + + const int dstOrder[] = {2, 1, 0, 3}; + + TEST_CYCLE() + { + cv::gpu::swapChannels(src, dstOrder); + } +} + +INSTANTIATE_TEST_CASE_P(ImgProc, SwapChannels, testing::Combine(ALL_DEVICES, GPU_TYPICAL_MAT_SIZES)); + +////////////////////////////////////////////////////////////////////// // Threshold GPU_PERF_TEST(Threshold, cv::gpu::DeviceInfo, cv::Size, perf::MatType) @@ -457,7 +483,7 @@ GPU_PERF_TEST(Rotate, cv::gpu::DeviceInfo, cv::Size, perf::MatType, Interpolatio INSTANTIATE_TEST_CASE_P(ImgProc, Rotate, testing::Combine( ALL_DEVICES, GPU_TYPICAL_MAT_SIZES, - testing::Values(CV_8UC1, CV_8UC4), + testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4), testing::Values((int) cv::INTER_NEAREST, (int) cv::INTER_LINEAR, (int) cv::INTER_CUBIC))); ////////////////////////////////////////////////////////////////////// @@ -520,33 +546,6 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Integral, testing::Combine( GPU_TYPICAL_MAT_SIZES)); ////////////////////////////////////////////////////////////////////// -// IntegralBoth - -GPU_PERF_TEST(IntegralBoth, cv::gpu::DeviceInfo, cv::Size) -{ - cv::gpu::DeviceInfo devInfo = GET_PARAM(0); - cv::Size size = GET_PARAM(1); - - cv::gpu::setDevice(devInfo.deviceID()); - - cv::Mat src_host(size, CV_8UC1); - - declare.in(src_host, WARMUP_RNG); - - cv::gpu::GpuMat src(src_host); - cv::gpu::GpuMat sum, sqsum; - - TEST_CYCLE() - { - cv::gpu::integral(src, sum, sqsum); - } -} - -INSTANTIATE_TEST_CASE_P(ImgProc, IntegralBoth, testing::Combine( - ALL_DEVICES, - GPU_TYPICAL_MAT_SIZES)); - -////////////////////////////////////////////////////////////////////// // IntegralSqr GPU_PERF_TEST(IntegralSqr, cv::gpu::DeviceInfo, cv::Size) @@ -850,6 +849,39 @@ INSTANTIATE_TEST_CASE_P(ImgProc, BlendLinear, testing::Combine( testing::Values(CV_8UC1, CV_32FC1))); ////////////////////////////////////////////////////////////////////// +// AlphaComp + +GPU_PERF_TEST(AlphaComp, cv::gpu::DeviceInfo, cv::Size, perf::MatType, AlphaOp) +{ + cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::Size size = GET_PARAM(1); + int type = GET_PARAM(2); + int alpha_op = GET_PARAM(3); + + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Mat img1_host(size, type); + cv::Mat img2_host(size, type); + + declare.in(img1_host, img2_host, WARMUP_RNG); + + cv::gpu::GpuMat img1(img1_host); + cv::gpu::GpuMat img2(img2_host); + cv::gpu::GpuMat dst; + + TEST_CYCLE() + { + cv::gpu::alphaComp(img1, img2, dst, alpha_op); + } +} + +INSTANTIATE_TEST_CASE_P(ImgProc, AlphaComp, testing::Combine( + ALL_DEVICES, + GPU_TYPICAL_MAT_SIZES, + testing::Values(CV_8UC4, CV_16UC4, CV_32SC4, CV_32FC4), + testing::Values((int)cv::gpu::ALPHA_OVER, (int)cv::gpu::ALPHA_IN, (int)cv::gpu::ALPHA_OUT, 
(int)cv::gpu::ALPHA_ATOP, (int)cv::gpu::ALPHA_XOR, (int)cv::gpu::ALPHA_PLUS, (int)cv::gpu::ALPHA_OVER_PREMUL, (int)cv::gpu::ALPHA_IN_PREMUL, (int)cv::gpu::ALPHA_OUT_PREMUL, (int)cv::gpu::ALPHA_ATOP_PREMUL, (int)cv::gpu::ALPHA_XOR_PREMUL, (int)cv::gpu::ALPHA_PLUS_PREMUL, (int)cv::gpu::ALPHA_PREMUL))); + +////////////////////////////////////////////////////////////////////// // Canny GPU_PERF_TEST_1(Canny, cv::gpu::DeviceInfo) diff --git a/modules/gpu/perf/perf_main.cpp b/modules/gpu/perf/perf_main.cpp index e667c97..0cd4002 100644 --- a/modules/gpu/perf/perf_main.cpp +++ b/modules/gpu/perf/perf_main.cpp @@ -11,7 +11,7 @@ int main(int argc, char **argv) #else -int main(int argc, char** argv) +int main() { printf("OpenCV was built without CUDA support\n"); return 0; diff --git a/modules/gpu/perf/perf_utility.hpp b/modules/gpu/perf/perf_utility.hpp index f0d6f1e..f15610b 100644 --- a/modules/gpu/perf/perf_utility.hpp +++ b/modules/gpu/perf/perf_utility.hpp @@ -11,6 +11,7 @@ CV_ENUM(FlipCode, HORIZONTAL_AXIS, VERTICAL_AXIS, BOTH_AXIS) CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC) CV_ENUM(MatchMethod, cv::TM_SQDIFF, cv::TM_SQDIFF_NORMED, cv::TM_CCORR, cv::TM_CCORR_NORMED, cv::TM_CCOEFF, cv::TM_CCOEFF_NORMED) CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2) +CV_ENUM(AlphaOp, cv::gpu::ALPHA_OVER, cv::gpu::ALPHA_IN, cv::gpu::ALPHA_OUT, cv::gpu::ALPHA_ATOP, cv::gpu::ALPHA_XOR, cv::gpu::ALPHA_PLUS, cv::gpu::ALPHA_OVER_PREMUL, cv::gpu::ALPHA_IN_PREMUL, cv::gpu::ALPHA_OUT_PREMUL, cv::gpu::ALPHA_ATOP_PREMUL, cv::gpu::ALPHA_XOR_PREMUL, cv::gpu::ALPHA_PLUS_PREMUL, cv::gpu::ALPHA_PREMUL) struct CvtColorInfo { diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 1f40156..76e2621 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -52,8 +52,6 @@ void cv::gpu::gemm(const GpuMat&, const GpuMat&, double, const GpuMat&, double, void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); } -void cv::gpu::exp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } -void cv::gpu::log(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::magnitude(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } @@ -89,9 +87,9 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2); CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type())); - bool tr1 = flags & GEMM_1_T; - bool tr2 = flags & GEMM_2_T; - bool tr3 = flags & GEMM_3_T; + bool tr1 = (flags & GEMM_1_T) != 0; + bool tr2 = (flags & GEMM_2_T) != 0; + bool tr3 = (flags & GEMM_3_T) != 0; Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size(); Size src2Size = tr2 ? 
Size(src2.rows, src2.cols) : src2.size(); @@ -243,35 +241,66 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s) //////////////////////////////////////////////////////////////////////// // flip -void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& s) +namespace { - CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4); + template struct NppTypeTraits; + template<> struct NppTypeTraits { typedef Npp8u npp_t; }; + template<> struct NppTypeTraits { typedef Npp8s npp_t; }; + template<> struct NppTypeTraits { typedef Npp16u npp_t; }; + template<> struct NppTypeTraits { typedef Npp16s npp_t; }; + template<> struct NppTypeTraits { typedef Npp32s npp_t; }; + template<> struct NppTypeTraits { typedef Npp32f npp_t; }; + template<> struct NppTypeTraits { typedef Npp64f npp_t; }; + + template struct NppMirrorFunc + { + typedef typename NppTypeTraits::npp_t npp_t; - dst.create( src.size(), src.type() ); + typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip); + }; - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; + template ::func_t func> struct NppMirror + { + typedef typename NppMirrorFunc::npp_t npp_t; - cudaStream_t stream = StreamAccessor::getStream(s); + static void call(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream) + { + NppStreamHandler h(stream); - NppStreamHandler h(stream); + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; - if (src.type() == CV_8UC1) - { - nppSafeCall( nppiMirror_8u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, - (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) ); - } - else + nppSafeCall( func(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, + (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} + +void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream); + + static const func_t funcs[6][4] = { - nppSafeCall( nppiMirror_8u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, - (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? 
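With the NppMirror wrapper above, gpu::flip dispatches on depth and channel count instead of being limited to CV_8UC1/CV_8UC4. A minimal usage sketch on one of the newly supported formats::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    void flipExample(const cv::Mat& src32fc3)          // CV_32FC3, newly supported
    {
        cv::gpu::GpuMat d_src(src32fc3);
        cv::gpu::GpuMat d_dst;

        cv::gpu::flip(d_src, d_dst, 0);                // flip around the x-axis
        cv::gpu::flip(d_src, d_dst, 1);                // positive code: around the y-axis
        cv::gpu::flip(d_src, d_dst, -1);               // negative code: around both axes
    }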
NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) ); - } + {NppMirror::call, 0, NppMirror::call, NppMirror::call}, + {0,0,0,0}, + {NppMirror::call, 0, NppMirror::call, NppMirror::call}, + {0,0,0,0}, + {NppMirror::call, 0, NppMirror::call, NppMirror::call}, + {NppMirror::call, 0, NppMirror::call, NppMirror::call} + }; - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F); + CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4); + + dst.create(src.size(), src.type()); + + funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream)); } //////////////////////////////////////////////////////////////////////// @@ -341,52 +370,6 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s) } //////////////////////////////////////////////////////////////////////// -// exp - -void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& s) -{ - CV_Assert(src.type() == CV_32FC1); - - dst.create(src.size(), src.type()); - - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - cudaStream_t stream = StreamAccessor::getStream(s); - - NppStreamHandler h(stream); - - nppSafeCall( nppiExp_32f_C1R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} - -//////////////////////////////////////////////////////////////////////// -// log - -void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& s) -{ - CV_Assert(src.type() == CV_32FC1); - - dst.create(src.size(), src.type()); - - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - cudaStream_t stream = StreamAccessor::getStream(s); - - NppStreamHandler h(stream); - - nppSafeCall( nppiLn_32f_C1R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} - -//////////////////////////////////////////////////////////////////////// // NPP magnitide namespace diff --git a/modules/gpu/src/color.cpp b/modules/gpu/src/color.cpp index 23266b8..89b44b9 100644 --- a/modules/gpu/src/color.cpp +++ b/modules/gpu/src/color.cpp @@ -48,6 +48,7 @@ using namespace cv::gpu; #if !defined (HAVE_CUDA) void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); } +void cv::gpu::swapChannels(GpuMat&, const int[], Stream&) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ @@ -1423,4 +1424,19 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream func(src, dst, dcn, stream); } +void cv::gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& s) +{ + CV_Assert(image.type() == CV_8UC4); + + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = image.cols; + sz.height = image.rows; + + nppSafeCall( nppiSwapChannels_8u_C4IR(image.ptr(), static_cast(image.step), sz, dstOrder) ); +} + #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/gpu/src/cuda/bilateral_filter.cu b/modules/gpu/src/cuda/bilateral_filter.cu index 8ba5748..b6d13e1 100644 --- a/modules/gpu/src/cuda/bilateral_filter.cu +++ b/modules/gpu/src/cuda/bilateral_filter.cu @@ -209,7 +209,7 @@ namespace cv { namespace gpu { namespace device cv::gpu::error("Unsupported channels count", __FILE__, __LINE__, "bilateral_filter_caller"); } - if (stream != 0) + if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } diff --git 
a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index 8d19955..27136d5 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -1104,9 +1104,9 @@ namespace cv { namespace gpu { namespace device cv::gpu::device::transform((DevMem2D_)src1, (DevMem2D_)dst, op, WithOutMask(), stream); } - template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); + //template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); - template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); + //template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); //template void absdiff_gpu(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream); diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp index 230a0f8..e1f1eb0 100644 --- a/modules/gpu/src/element_operations.cpp +++ b/modules/gpu/src/element_operations.cpp @@ -58,17 +58,28 @@ void cv::gpu::divide(const GpuMat&, const Scalar&, GpuMat&, double, int, Stream& void cv::gpu::divide(double, const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } void cv::gpu::absdiff(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::absdiff(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::abs(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::sqr(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::sqrt(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::exp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::log(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::compare(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::bitwise_or(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::bitwise_and(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::bitwise_xor(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::rshift(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::lshift(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::min(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::max(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); } double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int, Stream&) 
{throw_nogpu(); return 0.0;} void cv::gpu::pow(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::alphaComp(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } void cv::gpu::addWeighted(const GpuMat&, double, const GpuMat&, double, double, GpuMat&, int, Stream&) { throw_nogpu(); } #else @@ -79,20 +90,33 @@ void cv::gpu::addWeighted(const GpuMat&, double, const GpuMat&, double, double, namespace { typedef NppStatus (*npp_arithm_8u_t)(const Npp8u* pSrc1, int nSrc1Step, const Npp8u* pSrc2, int nSrc2Step, Npp8u* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); - typedef NppStatus (*npp_arithm_32s_t)(const Npp32s* pSrc1, int nSrc1Step, const Npp32s* pSrc2, int nSrc2Step, Npp32s* pDst, int nDstStep, NppiSize oSizeROI); + typedef NppStatus (*npp_arithm_16u_t)(const Npp16u* pSrc1, int nSrc1Step, const Npp16u* pSrc2, int nSrc2Step, Npp16u* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); + typedef NppStatus (*npp_arithm_16s_t)(const Npp16s* pSrc1, int nSrc1Step, const Npp16s* pSrc2, int nSrc2Step, Npp16s* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); + typedef NppStatus (*npp_arithm_32s_t)(const Npp32s* pSrc1, int nSrc1Step, const Npp32s* pSrc2, int nSrc2Step, Npp32s* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); typedef NppStatus (*npp_arithm_32f_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst, int nDstStep, NppiSize oSizeROI); - void nppArithmCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, + bool nppArithmCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, npp_arithm_8u_t npp_func_8uc1, npp_arithm_8u_t npp_func_8uc4, - npp_arithm_32s_t npp_func_32sc1, npp_arithm_32f_t npp_func_32fc1, cudaStream_t stream) + npp_arithm_16u_t npp_func_16uc1, npp_arithm_16u_t npp_func_16uc4, + npp_arithm_16s_t npp_func_16sc1, npp_arithm_16s_t npp_func_16sc4, + npp_arithm_32s_t npp_func_32sc1, + npp_arithm_32f_t npp_func_32fc1, npp_arithm_32f_t npp_func_32fc4, + cudaStream_t stream) { + bool useNpp = (src1.depth() == CV_8U || src1.depth() == CV_16U || src1.depth() == CV_16S || src1.depth() == CV_32S || src1.depth() == CV_32F); + + if (!useNpp) + return false; + + bool aligned = isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16); + NppiSize sz; sz.width = src1.cols * src1.channels(); sz.height = src1.rows; NppStreamHandler h(stream); - if (src1.depth() == CV_8U && (src1.cols * src1.channels()) % 4 == 0) + if (aligned && src1.depth() == CV_8U && (sz.width % 4) == 0) { sz.width /= 4; @@ -104,12 +128,43 @@ namespace nppSafeCall( npp_func_8uc1(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), dst.ptr(), static_cast(dst.step), sz, 0) ); } + else if (aligned && src1.depth() == CV_16U && (sz.width % 4) == 0) + { + sz.width /= 4; + + nppSafeCall( npp_func_16uc4(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), + dst.ptr(), static_cast(dst.step), sz, 0) ); + } + else if (src1.depth() == CV_16U) + { + nppSafeCall( npp_func_16uc1(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), + dst.ptr(), static_cast(dst.step), sz, 0) ); + } + else if (aligned && src1.depth() == CV_16S && (sz.width % 4) == 0) + { + sz.width /= 4; + + nppSafeCall( npp_func_16sc4(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), + dst.ptr(), static_cast(dst.step), sz, 0) ); + } + else if (src1.depth() == CV_16S) + { + nppSafeCall( npp_func_16sc1(src1.ptr(), static_cast(src1.step), src2.ptr(), 
static_cast(src2.step), + dst.ptr(), static_cast(dst.step), sz, 0) ); + } else if (src1.depth() == CV_32S) { nppSafeCall( npp_func_32sc1(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), - dst.ptr(), static_cast(dst.step), sz) ); + dst.ptr(), static_cast(dst.step), sz, 0) ); } - else if (src1.depth() == CV_32F) + else if (aligned && src1.depth() == CV_32F && (sz.width % 4) == 0) + { + sz.width /= 4; + + nppSafeCall( npp_func_32fc4(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), + dst.ptr(), static_cast(dst.step), sz) ); + } + else // if (src1.depth() == CV_32F) { nppSafeCall( npp_func_32fc1(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), dst.ptr(), static_cast(dst.step), sz) ); @@ -117,6 +172,8 @@ namespace if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); + + return true; } } @@ -159,16 +216,18 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu cudaStream_t stream = StreamAccessor::getStream(s); - bool useNpp = - mask.empty() && - dst.type() == src1.type() && - (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F) && - (isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16)); - - if (useNpp) + if (mask.empty() && dst.type() == src1.type()) { - nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R, stream); - return; + if (nppArithmCaller(src1, src2, dst, + nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, + nppiAdd_16u_C1RSfs, nppiAdd_16u_C4RSfs, + nppiAdd_16s_C1RSfs, nppiAdd_16s_C4RSfs, + nppiAdd_32s_C1RSfs, + nppiAdd_32f_C1R, nppiAdd_32f_C4R, + stream)) + { + return; + } } const func_t func = funcs[src1.depth()][dst.depth()]; @@ -177,6 +236,169 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu func(src1.reshape(1), src2.reshape(1), dst.reshape(1), mask, stream); } +namespace +{ + template struct NppTypeTraits; + template<> struct NppTypeTraits { typedef Npp8u npp_t; }; + template<> struct NppTypeTraits { typedef Npp8s npp_t; }; + template<> struct NppTypeTraits { typedef Npp16u npp_t; }; + template<> struct NppTypeTraits { typedef Npp16s npp_t; typedef Npp16sc npp_complex_type; }; + template<> struct NppTypeTraits { typedef Npp32s npp_t; typedef Npp32sc npp_complex_type; }; + template<> struct NppTypeTraits { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; }; + template<> struct NppTypeTraits { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; }; + + template struct NppArithmScalarFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_ptr)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pConstants, + npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); + }; + template struct NppArithmScalarFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_ptr)(const npp_t* pSrc1, int nSrc1Step, const npp_t pConstants, + npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); + }; + template struct NppArithmScalarFunc + { + typedef typename NppTypeTraits::npp_complex_type npp_complex_type; + + typedef NppStatus (*func_ptr)(const npp_complex_type* pSrc1, int nSrc1Step, const npp_complex_type pConstants, + npp_complex_type* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); + }; + template struct NppArithmScalarFunc + { + typedef NppStatus (*func_ptr)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pConstants, Npp32f* pDst, int nDstStep, NppiSize oSizeROI); + }; + 
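Usage sketch (illustrative only, not part of the patch): the NppArithmScalarFunc / NppArithmScalar machinery above gives the Scalar overloads of add and subtract a per-depth, per-channel NPP "AddC/SubC" path instead of the previous CV_32FC1/CV_32FC2-only special case. A minimal example, assuming a CUDA-capable device and the gpu module headers; the exact NPP primitive reached for each type is selected by the dispatch tables defined further below.

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat host(480, 640, CV_16UC4, cv::Scalar::all(100));
    cv::gpu::GpuMat src(host), dst;

    // Per-channel constant add on a 16-bit, 4-channel image: with this
    // patch it is handled by the NPP AddC family (presumably
    // nppiAddC_16u_C4RSfs) rather than the generic CUDA kernel.
    cv::gpu::add(src, cv::Scalar(1, 2, 3, 4), dst);

    // Single-channel float keeps its NPP path (nppiSubC_32f_C1R).
    cv::gpu::GpuMat srcf(host.size(), CV_32FC1), dstf;
    srcf.setTo(cv::Scalar::all(3.5));
    cv::gpu::subtract(srcf, cv::Scalar::all(0.5), dstf);

    return 0;
}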
template<> struct NppArithmScalarFunc + { + typedef NppStatus (*func_ptr)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f pConstants, Npp32f* pDst, int nDstStep, NppiSize oSizeROI); + }; + template<> struct NppArithmScalarFunc + { + typedef NppStatus (*func_ptr)(const Npp32fc* pSrc1, int nSrc1Step, const Npp32fc pConstants, Npp32fc* pDst, int nDstStep, NppiSize oSizeROI); + }; + + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + const npp_t pConstants[] = { saturate_cast(sc.val[0]), saturate_cast(sc.val[1]), saturate_cast(sc.val[2]), saturate_cast(sc.val[3]) }; + + nppSafeCall( func(src.ptr(), static_cast(src.step), pConstants, dst.ptr(), static_cast(dst.step), sz, 0) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), saturate_cast(sc.val[0]), dst.ptr(), static_cast(dst.step), sz, 0) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + typedef typename NppTypeTraits::npp_complex_type npp_complex_type; + + static void call(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + npp_complex_type nConstant; + nConstant.re = saturate_cast(sc.val[0]); + nConstant.im = saturate_cast(sc.val[1]); + + nppSafeCall( func(src.ptr(), static_cast(src.step), nConstant, + dst.ptr(), static_cast(dst.step), sz, 0) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + static void call(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + const Npp32f pConstants[] = { saturate_cast(sc.val[0]), saturate_cast(sc.val[1]), saturate_cast(sc.val[2]), saturate_cast(sc.val[3]) }; + + nppSafeCall( func(src.ptr(), static_cast(src.step), pConstants, dst.ptr(), static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + static void call(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), saturate_cast(sc.val[0]), dst.ptr(), static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + static void call(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Npp32fc nConstant; + nConstant.re = saturate_cast(sc.val[0]); + nConstant.im = saturate_cast(sc.val[1]); + + nppSafeCall( func(src.ptr(), 
static_cast(src.step), nConstant, dst.ptr(), static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} + void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s) { using namespace ::cv::gpu::device; @@ -194,7 +416,18 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat {0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, 0/*add_gpu*/, add_gpu} }; - CV_Assert(src.channels() == 1 || src.type() == CV_32FC2); + typedef void (*npp_func_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream); + static const npp_func_t npp_funcs[7][4] = + { + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {0,0,0,0}, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, 0}, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call}, + {0,0,0,0} + }; + CV_Assert(mask.empty() || (src.channels() == 1 && mask.size() == src.size() && mask.type() == CV_8U)); if (dtype < 0) @@ -204,34 +437,19 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat cudaStream_t stream = StreamAccessor::getStream(s); - if (mask.empty() && dst.type() == src.type() && src.depth() == CV_32F) + if (mask.empty() && dst.type() == src.type()) { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; + const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1]; - NppStreamHandler h(stream); - - if (src.type() == CV_32FC1) - { - nppSafeCall( nppiAddC_32f_C1R(src.ptr(), static_cast(src.step), static_cast(sc.val[0]), - dst.ptr(), static_cast(dst.step), sz) ); - } - else + if (npp_func) { - Npp32fc nValue; - nValue.re = static_cast(sc.val[0]); - nValue.im = static_cast(sc.val[1]); - nppSafeCall( nppiAddC_32fc_C1R(src.ptr(), static_cast(src.step), nValue, - dst.ptr(), static_cast(dst.step), sz) ); + npp_func(src, sc, dst, stream); + return; } - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - - return; } + CV_Assert(src.channels() == 1); + const func_t func = funcs[src.depth()][dst.depth()]; CV_Assert(func != 0); @@ -277,16 +495,18 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons cudaStream_t stream = StreamAccessor::getStream(s); - bool useNpp = - mask.empty() && - dst.type() == src1.type() && - (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F) && - (isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16)); - - if (useNpp) + if (mask.empty() && dst.type() == src1.type()) { - nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R, stream); - return; + if (nppArithmCaller(src2, src1, dst, + nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, + nppiSub_16u_C1RSfs, nppiSub_16u_C4RSfs, + nppiSub_16s_C1RSfs, nppiSub_16s_C4RSfs, + nppiSub_32s_C1RSfs, + nppiSub_32f_C1R, nppiSub_32f_C4R, + stream)) + { + return; + } } const func_t func = funcs[src1.depth()][dst.depth()]; @@ -312,7 +532,18 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G {0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, 0/*subtract_gpu*/, subtract_gpu} }; - CV_Assert(src.channels() == 1 || src.type() == CV_32FC2); + 
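Usage sketch (illustrative only, not part of the patch): the widened nppArithmCaller above keeps the matrix-matrix forms of add, subtract, multiply and divide on NPP for CV_16U and CV_16S data as well (multiply/divide only when scale == 1 and dst has the source type). A minimal example under those assumptions:

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::gpu::GpuMat a(240, 320, CV_16SC1), b(240, 320, CV_16SC1), c;
    a.setTo(cv::Scalar::all(1000));
    b.setTo(cv::Scalar::all(250));

    cv::gpu::add(a, b, c);       // routed to nppiAdd_16s_C1RSfs
    cv::gpu::subtract(a, b, c);  // routed to nppiSub_16s_C1RSfs
    cv::gpu::multiply(a, b, c);  // routed to nppiMul_16s_C1RSfs (scale == 1)
    cv::gpu::divide(a, b, c);    // routed to nppiDiv_16s_C1RSfs (scale == 1)

    // When the data pointers are 16-byte aligned and the row width is a
    // multiple of 4, the caller repacks the C1 image as C4 and uses the
    // corresponding *_C4RSfs primitive for higher throughput.
    return 0;
}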
typedef void (*npp_func_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream); + static const npp_func_t npp_funcs[7][4] = + { + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {0,0,0,0}, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, 0}, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call}, + {0,0,0,0} + }; + CV_Assert(mask.empty() || (src.channels() == 1 && mask.size() == src.size() && mask.type() == CV_8U)); if (dtype < 0) @@ -322,34 +553,19 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G cudaStream_t stream = StreamAccessor::getStream(s); - if (mask.empty() && dst.type() == src.type() && src.depth() == CV_32F) + if (mask.empty() && dst.type() == src.type()) { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; + const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1]; - NppStreamHandler h(stream); - - if (src.type() == CV_32FC1) - { - nppSafeCall( nppiSubC_32f_C1R(src.ptr(), static_cast(src.step), static_cast(sc.val[0]), - dst.ptr(), static_cast(dst.step), sz) ); - } - else + if (npp_func) { - Npp32fc nValue; - nValue.re = static_cast(sc.val[0]); - nValue.im = static_cast(sc.val[1]); - nppSafeCall( nppiSubC_32fc_C1R(src.ptr(), static_cast(src.step), nValue, - dst.ptr(), static_cast(dst.step), sz) ); + npp_func(src, sc, dst, stream); + return; } - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - - return; } + CV_Assert(src.channels() == 1); + const func_t func = funcs[src.depth()][dst.depth()]; CV_Assert(func != 0); @@ -415,16 +631,18 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels())); - bool useNpp = - scale == 1 && - dst.type() == src1.type() && - (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F) && - (isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16)); - - if (useNpp) + if (scale == 1 && dst.type() == src1.type()) { - nppArithmCaller(src2, src1, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, stream); - return; + if (nppArithmCaller(src1, src2, dst, + nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, + nppiMul_16u_C1RSfs, nppiMul_16u_C4RSfs, + nppiMul_16s_C1RSfs, nppiMul_16s_C4RSfs, + nppiMul_32s_C1RSfs, + nppiMul_32f_C1R, nppiMul_32f_C4R, + stream)) + { + return; + } } const func_t func = funcs[src1.depth()][dst.depth()]; @@ -434,6 +652,16 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub } } +namespace +{ + bool isIntScalar(Scalar sc) + { + Scalar_ isc(sc); + + return sc.val[0] == isc.val[0] && sc.val[1] == isc.val[1] && sc.val[2] == isc.val[2] && sc.val[3] == isc.val[3]; + } +} + void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s) { using namespace ::cv::gpu::device; @@ -451,7 +679,17 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double {0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, 0/*multiply_gpu*/, multiply_gpu} }; - //CV_Assert(src.channels() == 1); + typedef void (*npp_func_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream); + static const npp_func_t 
npp_funcs[7][4] = + { + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {0,0,0,0}, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, 0, NppArithmScalar::call, 0}, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {0,0,0,0} + }; if (dtype < 0) dtype = src.depth(); @@ -460,24 +698,19 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double cudaStream_t stream = StreamAccessor::getStream(s); - if (dst.type() == src.type() && src.type() == CV_32FC1 && scale == 1) + if (dst.type() == src.type() && scale == 1) { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; + const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1]; - NppStreamHandler h(stream); - - nppSafeCall( nppiMulC_32f_C1R(src.ptr(), static_cast(src.step), static_cast(sc.val[0]), - dst.ptr(), static_cast(dst.step), sz) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - - return; + if (npp_func && (src.depth() == CV_32F || isIntScalar(sc))) + { + npp_func(src, sc, dst, stream); + return; + } } const func_t func = funcs[src.depth()][dst.depth()]; + CV_Assert(func != 0); func(src.reshape(1), sc.val[0], dst.reshape(1), scale, stream); @@ -545,16 +778,18 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels())); - bool useNpp = - scale == 1 && - dst.type() == src1.type() && - (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F) && - (isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16)); - - if (useNpp) + if (scale == 1 && dst.type() == src1.type()) { - nppArithmCaller(src2, src1, dst, nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs, nppiDiv_32s_C1R, nppiDiv_32f_C1R, stream); - return; + if (nppArithmCaller(src2, src1, dst, + nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs, + nppiDiv_16u_C1RSfs, nppiDiv_16u_C4RSfs, + nppiDiv_16s_C1RSfs, nppiDiv_16s_C4RSfs, + nppiDiv_32s_C1RSfs, + nppiDiv_32f_C1R, nppiDiv_32f_C4R, + stream)) + { + return; + } } const func_t func = funcs[src1.depth()][dst.depth()]; @@ -581,7 +816,17 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc {0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, 0/*divide_gpu*/, divide_gpu} }; - CV_Assert(src.channels() == 1); + typedef void (*npp_func_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream); + static const npp_func_t npp_funcs[7][4] = + { + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {0,0,0,0}, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, 0, NppArithmScalar::call, 0}, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {0,0,0,0} + }; if (dtype < 0) dtype = src.depth(); @@ -590,27 +835,22 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc cudaStream_t stream = StreamAccessor::getStream(s); - if (dst.type() == src.type() && src.type() == CV_32FC1 && scale == 1) + if (dst.type() == src.type() && scale == 1) { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - NppStreamHandler h(stream); + const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1]; - nppSafeCall( 
nppiDivC_32f_C1R(src.ptr(), static_cast(src.step), static_cast(sc.val[0]), - dst.ptr(), static_cast(dst.step), sz) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - - return; + if (npp_func && (src.depth() == CV_32F || isIntScalar(sc))) + { + npp_func(src, sc, dst, stream); + return; + } } const func_t func = funcs[src.depth()][dst.depth()]; + CV_Assert(func != 0); - func(src, sc.val[0], dst, scale, stream); + func(src.reshape(1), sc.val[0], dst.reshape(1), scale, stream); } void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, Stream& s) @@ -639,147 +879,413 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St cudaStream_t stream = StreamAccessor::getStream(s); - const func_t func = funcs[src.depth()][dst.depth()]; - CV_Assert(func != 0); + const func_t func = funcs[src.depth()][dst.depth()]; + CV_Assert(func != 0); + + func(scale, src, dst, stream); +} + +////////////////////////////////////////////////////////////////////////////// +// absdiff + +namespace cv { namespace gpu { namespace device +{ + template + void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); + + template + void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream); +}}} + +void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s) +{ + using namespace ::cv::gpu::device; + + typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); + + static const func_t funcs[] = + { + absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu + }; + + CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); + + dst.create( src1.size(), src1.type() ); + + cudaStream_t stream = StreamAccessor::getStream(s); + + NppiSize sz; + sz.width = src1.cols * src1.channels(); + sz.height = src1.rows; + + if (src1.depth() == CV_8U) + { + NppStreamHandler h(stream); + + nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), + dst.ptr(), static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + else if (src1.depth() == CV_16U) + { + NppStreamHandler h(stream); + + nppSafeCall( nppiAbsDiff_16u_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), + dst.ptr(), static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + else if (src1.depth() == CV_32F) + { + NppStreamHandler h(stream); + + nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), + dst.ptr(), static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + else + { + const func_t func = funcs[src1.depth()]; + CV_Assert(func != 0); + + func(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream); + } +} + +namespace +{ + template struct NppAbsDiffCFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, npp_t nConstant); + }; + template <> struct NppAbsDiffCFunc + { + typedef NppStatus (*func_t)(const Npp16u* pSrc1, int nSrc1Step, Npp16u* pDst, int nDstStep, NppiSize oSizeROI, Npp32u nConstant); + }; + + template ::func_t func> struct NppAbsDiffC + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const DevMem2Db& src1, double val, const 
DevMem2Db& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src1.cols; + sz.height = src1.rows; + + nppSafeCall( func((const npp_t*)src1.data, static_cast(src1.step), (npp_t*)dst.data, static_cast(dst.step), + sz, static_cast(val)) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} + +void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& s) +{ + using namespace cv::gpu::device; + + typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream); + + static const func_t funcs[] = + { + NppAbsDiffC::call, + absdiff_gpu, + NppAbsDiffC::call, + absdiff_gpu, + absdiff_gpu, + NppAbsDiffC::call, + absdiff_gpu + }; + + CV_Assert(src1.channels() == 1); + + dst.create(src1.size(), src1.type()); + + cudaStream_t stream = StreamAccessor::getStream(s); + + funcs[src1.depth()](src1, src2.val[0], dst, stream); +} + +////////////////////////////////////////////////////////////////////////////// +// abs + +void cv::gpu::abs(const GpuMat& src, GpuMat& dst, Stream& s) +{ + CV_Assert(src.depth() == CV_16S || src.depth() == CV_32F); + + dst.create(src.size(), src.type()); + + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = src.cols * src.channels(); + oSizeROI.height = src.rows; + + bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); + + if (src.depth() == CV_16S) + { + if (aligned && oSizeROI.width % 4 == 0) + { + oSizeROI.width /= 4; + nppSafeCall( nppiAbs_16s_C4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI) ); + } + else + { + nppSafeCall( nppiAbs_16s_C1R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI) ); + } + } + else + { + if (aligned && oSizeROI.width % 4 == 0) + { + oSizeROI.width /= 4; + nppSafeCall( nppiAbs_32f_C4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI) ); + } + else + { + nppSafeCall( nppiAbs_32f_C1R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI) ); + } + } + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} + +////////////////////////////////////////////////////////////////////////////// +// sqr + +namespace +{ + template struct NppSqrFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); + }; + template <> struct NppSqrFunc + { + typedef NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + + template ::func_t func, typename NppSqrFunc::func_t func_c4> struct NppSqr + { + typedef typename NppSqrFunc::npp_t npp_t; + + static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = src.cols * src.channels(); + oSizeROI.height = src.rows; + + bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); + + if (aligned && oSizeROI.width % 4 == 0) + { + oSizeROI.width /= 4; + nppSafeCall( func_c4(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI, 0) ); + } + else + { + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI, 0) ); + } + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template ::func_t func, typename NppSqrFunc::func_t 
func_c4> struct NppSqr + { + typedef NppSqrFunc::npp_t npp_t; + + static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); - func(scale, src, dst, stream); -} + NppiSize oSizeROI; + oSizeROI.width = src.cols * src.channels(); + oSizeROI.height = src.rows; -////////////////////////////////////////////////////////////////////////////// -// absdiff + bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); -namespace cv { namespace gpu { namespace device -{ - template - void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); + if (aligned && oSizeROI.width % 4 == 0) + { + oSizeROI.width /= 4; + nppSafeCall( func_c4(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI) ); + } + else + { + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI) ); + } - template - void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream); -}}} + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} -void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s) +void cv::gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream) { - using namespace ::cv::gpu::device; - - typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream); static const func_t funcs[] = { - absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu + NppSqr::call, + 0, + NppSqr::call, + NppSqr::call, + 0, + NppSqr::call }; - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - - dst.create( src1.size(), src1.type() ); + CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F); - cudaStream_t stream = StreamAccessor::getStream(s); + dst.create(src.size(), src.type()); - NppiSize sz; - sz.width = src1.cols * src1.channels(); - sz.height = src1.rows; + funcs[src.depth()](src, dst, StreamAccessor::getStream(stream)); +} - bool aligned = isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16); +////////////////////////////////////////////////////////////////////////////// +// sqrt -#if CUDART_VERSION == 4000 - if (aligned && src1.depth() == CV_8U && (src1.cols * src1.channels()) % 4 == 0) +namespace +{ + template struct NppOneSourceFunc { - NppStreamHandler h(stream); + typedef typename NppTypeTraits::npp_t npp_t; - sz.width /= 4; + typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); + }; + template <> struct NppOneSourceFunc + { + typedef NppTypeTraits::npp_t npp_t; - nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), - dst.ptr(), static_cast(dst.step), sz) ); + typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI); + }; - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - else -#endif + template ::func_t func> struct NppOneSource { - if (aligned && src1.depth() == CV_8U) - { - NppStreamHandler h(stream); - - nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), - dst.ptr(), static_cast(dst.step), sz) ); + typedef typename NppOneSourceFunc::npp_t npp_t; - if (stream == 0) - cudaSafeCall( 
cudaDeviceSynchronize() ); - } -#if CUDART_VERSION == 4000 - else if (aligned && src1.depth() == CV_32S) + static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream) { NppStreamHandler h(stream); - nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), - dst.ptr(), static_cast(dst.step), sz) ); + NppiSize oSizeROI; + oSizeROI.width = src.cols * src.channels(); + oSizeROI.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI, 0) ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } -#endif - else if (aligned && src1.depth() == CV_32F) + }; + template ::func_t func> struct NppOneSource + { + typedef NppOneSourceFunc::npp_t npp_t; + + static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream) { NppStreamHandler h(stream); - nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), - dst.ptr(), static_cast(dst.step), sz) ); + NppiSize oSizeROI; + oSizeROI.width = src.cols * src.channels(); + oSizeROI.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI) ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - else - { - const func_t func = funcs[src1.depth()]; - CV_Assert(func != 0); - - func(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream); - } - } + }; } -void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& s) +void cv::gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream) { - using namespace ::cv::gpu::device; - - typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream); + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream); static const func_t funcs[] = { - absdiff_gpu, absdiff_gpu, absdiff_gpu, absdiff_gpu,absdiff_gpu, 0/*absdiff_gpu*/, absdiff_gpu + NppOneSource::call, + 0, + NppOneSource::call, + NppOneSource::call, + 0, + NppOneSource::call }; - CV_Assert(src1.channels() == 1); + CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F); - dst.create(src1.size(), src1.type()); + dst.create(src.size(), src.type()); - cudaStream_t stream = StreamAccessor::getStream(s); + funcs[src.depth()](src, dst, StreamAccessor::getStream(stream)); +} + +//////////////////////////////////////////////////////////////////////// +// log + +void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream); - if (src1.type() == CV_32FC1) + static const func_t funcs[] = { - NppiSize sz; - sz.width = src1.cols; - sz.height = src1.rows; + NppOneSource::call, + 0, + NppOneSource::call, + NppOneSource::call, + 0, + NppOneSource::call + }; - cudaStream_t stream = StreamAccessor::getStream(s); + CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F); - NppStreamHandler h(stream); + dst.create(src.size(), src.type()); - nppSafeCall( nppiAbsDiffC_32f_C1R(src1.ptr(), static_cast(src1.step), - dst.ptr(), static_cast(dst.step), sz, static_cast(src2.val[0])) ); + funcs[src.depth()](src, dst, StreamAccessor::getStream(stream)); +} - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); +//////////////////////////////////////////////////////////////////////// +// exp - return; - } +void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream) +{ + typedef void 
(*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream); - const func_t func = funcs[src1.depth()]; - CV_Assert(func != 0); + static const func_t funcs[] = + { + NppOneSource::call, + 0, + NppOneSource::call, + NppOneSource::call, + 0, + NppOneSource::call + }; - func(src1, src2.val[0], dst, stream); -} + CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F); + dst.create(src.size(), src.type()); + + funcs[src.depth()](src, dst, StreamAccessor::getStream(stream)); +} ////////////////////////////////////////////////////////////////////////////// // Comparison of two matrixes @@ -1038,7 +1544,6 @@ namespace } } - void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream) { if (mask.empty()) @@ -1047,7 +1552,6 @@ void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, co bitwiseOrCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream)); } - void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream) { if (mask.empty()) @@ -1056,7 +1560,6 @@ void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c bitwiseAndCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream)); } - void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream) { if (mask.empty()) @@ -1065,6 +1568,219 @@ void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream)); } +namespace +{ + template struct NppBitwiseCFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pConstants, npp_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + template struct NppBitwiseCFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t pConstant, npp_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + + template ::func_t func> struct NppBitwiseC + { + typedef typename NppBitwiseCFunc::npp_t npp_t; + + static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = src.cols; + oSizeROI.height = src.rows; + + const npp_t pConstants[] = {static_cast(sc.val[0]), static_cast(sc.val[1]), static_cast(sc.val[2]), static_cast(sc.val[3])}; + + nppSafeCall( func(src.ptr(), static_cast(src.step), pConstants, dst.ptr(), static_cast(dst.step), oSizeROI) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template ::func_t func> struct NppBitwiseC + { + typedef typename NppBitwiseCFunc::npp_t npp_t; + + static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = src.cols; + oSizeROI.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), static_cast(sc.val[0]), dst.ptr(), static_cast(dst.step), oSizeROI) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} + +void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream); + + static const func_t funcs[5][4] = + { + {NppBitwiseC::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {0,0,0,0}, + {NppBitwiseC::call, 0, 
NppBitwiseC::call, NppBitwiseC::call}, + {0,0,0,0}, + {NppBitwiseC::call, 0, NppBitwiseC::call, NppBitwiseC::call} + }; + + CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S); + CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4); + + dst.create(src.size(), src.type()); + + funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream); + + static const func_t funcs[5][4] = + { + {NppBitwiseC::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {0,0,0,0}, + {NppBitwiseC::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {0,0,0,0}, + {NppBitwiseC::call, 0, NppBitwiseC::call, NppBitwiseC::call} + }; + + CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S); + CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4); + + dst.create(src.size(), src.type()); + + funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::bitwise_xor(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream); + + static const func_t funcs[5][4] = + { + {NppBitwiseC::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {0,0,0,0}, + {NppBitwiseC::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {0,0,0,0}, + {NppBitwiseC::call, 0, NppBitwiseC::call, NppBitwiseC::call} + }; + + CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S); + CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4); + + dst.create(src.size(), src.type()); + + funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream)); +} + +////////////////////////////////////////////////////////////////////////////// +// shift + +namespace +{ + template struct NppShiftFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const Npp32u* pConstants, npp_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + template struct NppShiftFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const Npp32u pConstants, npp_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + + template ::func_t func> struct NppShift + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = src.cols; + oSizeROI.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), sc.val, dst.ptr(), static_cast(dst.step), oSizeROI) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template ::func_t func> struct NppShift + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = src.cols; + oSizeROI.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), sc.val[0], dst.ptr(), static_cast(dst.step), oSizeROI) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} + +void cv::gpu::rshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) +{ + 
typedef void (*func_t)(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[5][4] = + { + {NppShift::call, 0, NppShift::call, NppShift::call }, + {NppShift::call, 0, NppShift::call, NppShift::call }, + {NppShift::call, 0, NppShift::call, NppShift::call}, + {NppShift::call, 0, NppShift::call, NppShift::call}, + {NppShift::call, 0, NppShift::call, NppShift::call}, + }; + + CV_Assert(src.depth() < CV_32F); + CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4); + + dst.create(src.size(), src.type()); + + funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::lshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[5][4] = + { + {NppShift::call , 0, NppShift::call , NppShift::call }, + {0 , 0, 0 , 0 }, + {NppShift::call, 0, NppShift::call, NppShift::call}, + {0 , 0, 0 , 0 }, + {NppShift::call, 0, NppShift::call, NppShift::call}, + }; + + CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S); + CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4); + + dst.create(src.size(), src.type()); + + funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream)); +} ////////////////////////////////////////////////////////////////////////////// // Minimum and maximum operations @@ -1187,7 +1903,7 @@ namespace { template void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream) { - ::cv::gpu::device::threshold_gpu(src, dst, saturate_cast(thresh), saturate_cast(maxVal), type, stream); + cv::gpu::device::threshold_gpu(src, dst, saturate_cast(thresh), saturate_cast(maxVal), type, stream); } } @@ -1216,11 +1932,7 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double } else { - CV_Assert((src.depth() != CV_64F) || - (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); - - typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, - cudaStream_t stream); + typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream); static const caller_t callers[] = { @@ -1266,7 +1978,82 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream) pow_caller, pow_caller }; - callers[src.depth()](src.reshape(1), (float)power, dst.reshape(1), StreamAccessor::getStream(stream)); + callers[src.depth()](src.reshape(1), (float)power, dst.reshape(1), StreamAccessor::getStream(stream)); +} + +//////////////////////////////////////////////////////////////////////// +// alphaComp + +namespace +{ + template struct NppAlphaCompFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, NppiAlphaOp eAlphaOp); + }; + + template ::func_t func> struct NppAlphaComp + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, NppiAlphaOp eAlphaOp, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = img1.cols; + oSizeROI.height = img2.rows; + + nppSafeCall( func(img1.ptr(), static_cast(img1.step), img2.ptr(), static_cast(img2.step), + 
dst.ptr(), static_cast(dst.step), oSizeROI, eAlphaOp) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} + +void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream) +{ + static const NppiAlphaOp npp_alpha_ops[] = { + NPPI_OP_ALPHA_OVER, + NPPI_OP_ALPHA_IN, + NPPI_OP_ALPHA_OUT, + NPPI_OP_ALPHA_ATOP, + NPPI_OP_ALPHA_XOR, + NPPI_OP_ALPHA_PLUS, + NPPI_OP_ALPHA_OVER_PREMUL, + NPPI_OP_ALPHA_IN_PREMUL, + NPPI_OP_ALPHA_OUT_PREMUL, + NPPI_OP_ALPHA_ATOP_PREMUL, + NPPI_OP_ALPHA_XOR_PREMUL, + NPPI_OP_ALPHA_PLUS_PREMUL, + NPPI_OP_ALPHA_PREMUL + }; + + typedef void (*func_t)(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, NppiAlphaOp eAlphaOp, cudaStream_t stream); + + static const func_t funcs[] = + { + NppAlphaComp::call, + 0, + NppAlphaComp::call, + 0, + NppAlphaComp::call, + NppAlphaComp::call, + 0 + }; + + CV_Assert(img1.type() == CV_8UC4 || img1.type() == CV_16UC4 || img1.type() == CV_32SC4 || img1.type() == CV_32FC4); + CV_Assert(img1.size() == img2.size() && img1.type() == img2.type()); + + dst.create(img1.size(), img1.type()); + + const func_t func = funcs[img1.depth()]; + CV_Assert(func != 0); + + func(img1, img2, dst, npp_alpha_ops[alpha_op], StreamAccessor::getStream(stream)); } //////////////////////////////////////////////////////////////////////// diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp index 42a0a39..e9977c6 100644 --- a/modules/gpu/src/filtering.cpp +++ b/modules/gpu/src/filtering.cpp @@ -50,7 +50,7 @@ using namespace cv::gpu; Ptr cv::gpu::createFilter2D_GPU(const Ptr&, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createSeparableFilter_GPU(const Ptr&, const Ptr&, int, int, int) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::createSeparableFilter_GPU(const Ptr&, const Ptr&, int, int, int, GpuMat& buf) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::createSeparableFilter_GPU(const Ptr&, const Ptr&, int, int, int, GpuMat&) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } diff --git a/modules/gpu/src/graphcuts.cpp b/modules/gpu/src/graphcuts.cpp index 90ccadc..aba9ee3 100644 --- a/modules/gpu/src/graphcuts.cpp +++ b/modules/gpu/src/graphcuts.cpp @@ -45,12 +45,41 @@ #if !defined (HAVE_CUDA) void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ +namespace +{ + typedef NppStatus (*init_func_t)(NppiSize oSize, NppiGraphcutState** ppState, Npp8u* pDeviceMem); + + class NppiGraphcutStateHandler + { + public: + NppiGraphcutStateHandler(NppiSize sznpp, Npp8u* pDeviceMem, const init_func_t func) + { + nppSafeCall( func(sznpp, &pState, pDeviceMem) ); + } + + ~NppiGraphcutStateHandler() + { + nppSafeCall( nppiGraphcutFree(pState) ); + } + + operator NppiGraphcutState*() + { + return pState; + } + + private: + NppiGraphcutState* pState; + }; +} + void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s) { Size src_size = terminals.size(); + CV_Assert(terminals.type() == CV_32S); 
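Usage sketch (illustrative only, not part of the patch): alphaComp above composites two images of identical size and type (CV_8UC4, CV_16UC4, CV_32SC4 or CV_32FC4). The example assumes the alpha-op enumerators (ALPHA_OVER, ALPHA_IN, ...) added to gpu.hpp, which index the npp_alpha_ops table:

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::gpu::GpuMat img1(480, 640, CV_8UC4), img2(480, 640, CV_8UC4), dst;
    img1.setTo(cv::Scalar(255, 0, 0, 128));   // semi-transparent blue (BGRA)
    img2.setTo(cv::Scalar(0, 255, 0, 255));   // opaque green

    // ALPHA_OVER is assumed to map to NPPI_OP_ALPHA_OVER via the table above.
    cv::gpu::alphaComp(img1, img2, dst, cv::gpu::ALPHA_OVER);

    return 0;
}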
CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width)); CV_Assert(leftTransp.type() == CV_32S); @@ -70,30 +99,76 @@ void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTrans int bufsz; nppSafeCall( nppiGraphcutGetSize(sznpp, &bufsz) ); - if ((size_t)bufsz > buf.cols * buf.rows * buf.elemSize()) - buf.create(1, bufsz, CV_8U); + ensureSizeIsEnough(1, bufsz, CV_8U, buf); cudaStream_t stream = StreamAccessor::getStream(s); NppStreamHandler h(stream); -#if CUDART_VERSION > 4000 - NppiGraphcutState* pState; - nppSafeCall( nppiGraphcutInitAlloc(sznpp, &pState, buf.ptr()) ); + NppiGraphcutStateHandler state(sznpp, buf.ptr(), nppiGraphcutInitAlloc); nppSafeCall( nppiGraphcut_32s8u(terminals.ptr(), leftTransp.ptr(), rightTransp.ptr(), top.ptr(), bottom.ptr(), - static_cast(terminals.step), static_cast(leftTransp.step), sznpp, labels.ptr(), static_cast(labels.step), pState) ); - - nppSafeCall( nppiGraphcutFree(pState) ); -#else - nppSafeCall( nppiGraphcut_32s8u(terminals.ptr(), leftTransp.ptr(), rightTransp.ptr(), top.ptr(), bottom.ptr(), - static_cast(terminals.step), static_cast(leftTransp.step), sznpp, labels.ptr(), static_cast(labels.step), buf.ptr()) ); -#endif + static_cast(terminals.step), static_cast(leftTransp.step), sznpp, labels.ptr(), static_cast(labels.step), state) ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } +void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight, + GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s) +{ + Size src_size = terminals.size(); -#endif /* !defined (HAVE_CUDA) */ + CV_Assert(terminals.type() == CV_32S); + + CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width)); + CV_Assert(leftTransp.type() == CV_32S); + + CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width)); + CV_Assert(rightTransp.type() == CV_32S); + + CV_Assert(top.size() == src_size); + CV_Assert(top.type() == CV_32S); + + CV_Assert(topLeft.size() == src_size); + CV_Assert(topLeft.type() == CV_32S); + + CV_Assert(topRight.size() == src_size); + CV_Assert(topRight.type() == CV_32S); + + CV_Assert(bottom.size() == src_size); + CV_Assert(bottom.type() == CV_32S); + + CV_Assert(bottomLeft.size() == src_size); + CV_Assert(bottomLeft.type() == CV_32S); + + CV_Assert(bottomRight.size() == src_size); + CV_Assert(bottomRight.type() == CV_32S); + + labels.create(src_size, CV_8U); + + NppiSize sznpp; + sznpp.width = src_size.width; + sznpp.height = src_size.height; + + int bufsz; + nppSafeCall( nppiGraphcut8GetSize(sznpp, &bufsz) ); + + ensureSizeIsEnough(1, bufsz, CV_8U, buf); + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + + NppiGraphcutStateHandler state(sznpp, buf.ptr(), nppiGraphcut8InitAlloc); + + nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr(), leftTransp.ptr(), rightTransp.ptr(), + top.ptr(), topLeft.ptr(), topRight.ptr(), + bottom.ptr(), bottomLeft.ptr(), bottomRight.ptr(), + static_cast(terminals.step), static_cast(leftTransp.step), sznpp, labels.ptr(), static_cast(labels.step), state) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} + +#endif /* !defined (HAVE_CUDA) */ diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index ed243a3..205c5b2 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -62,7 +62,6 @@ void cv::gpu::buildWarpSphericalMaps(Size, Rect, const Mat&, const Mat&, float, void 
cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int, Stream&) { throw_nogpu(); } void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } -void cv::gpu::integral(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::columnSum(const GpuMat&, GpuMat&) { throw_nogpu(); } void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&, Stream&) { throw_nogpu(); } @@ -91,7 +90,7 @@ void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, f void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_nogpu(); } void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); } void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); } -void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream) { throw_nogpu(); } +void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_nogpu(); } void cv::gpu::pyrDown(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } void cv::gpu::pyrUp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); } void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); } @@ -780,44 +779,78 @@ void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, //////////////////////////////////////////////////////////////////////// // rotate -void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, Stream& s) -{ - static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC}; +namespace +{ + template struct NppTypeTraits; + template<> struct NppTypeTraits { typedef Npp8u npp_t; }; + template<> struct NppTypeTraits { typedef Npp8s npp_t; }; + template<> struct NppTypeTraits { typedef Npp16u npp_t; }; + template<> struct NppTypeTraits { typedef Npp16s npp_t; }; + template<> struct NppTypeTraits { typedef Npp32s npp_t; }; + template<> struct NppTypeTraits { typedef Npp32f npp_t; }; + template<> struct NppTypeTraits { typedef Npp64f npp_t; }; + + template struct NppRotateFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, + npp_t* pDst, int nDstStep, NppiRect oDstROI, + double nAngle, double nShiftX, double nShiftY, int eInterpolation); + }; - CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4); - CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC); + template ::func_t func> struct NppRotate + { + typedef typename NppRotateFunc::npp_t npp_t; - dst.create(dsize, src.type()); + static void call(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream) + { + static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC}; - NppiSize srcsz; - srcsz.height = src.rows; - srcsz.width = src.cols; - NppiRect srcroi; - srcroi.x = srcroi.y = 0; - srcroi.height = src.rows; - srcroi.width = src.cols; - NppiRect dstroi; - dstroi.x = dstroi.y = 0; - dstroi.height = dst.rows; - dstroi.width = dst.cols; + NppStreamHandler h(stream); - cudaStream_t stream = StreamAccessor::getStream(s); + NppiSize srcsz; + srcsz.height = src.rows; + 
srcsz.width = src.cols; + NppiRect srcroi; + srcroi.x = srcroi.y = 0; + srcroi.height = src.rows; + srcroi.width = src.cols; + NppiRect dstroi; + dstroi.x = dstroi.y = 0; + dstroi.height = dst.rows; + dstroi.width = dst.cols; + + nppSafeCall( func(src.ptr(), srcsz, static_cast(src.step), srcroi, + dst.ptr(), static_cast(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) ); - NppStreamHandler h(stream); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} - if (src.type() == CV_8UC1) - { - nppSafeCall( nppiRotate_8u_C1R(src.ptr(), srcsz, static_cast(src.step), srcroi, - dst.ptr(), static_cast(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) ); - } - else +void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream); + + static const func_t funcs[6][4] = { - nppSafeCall( nppiRotate_8u_C4R(src.ptr(), srcsz, static_cast(src.step), srcroi, - dst.ptr(), static_cast(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) ); - } + {NppRotate::call, 0, NppRotate::call, NppRotate::call}, + {0,0,0,0}, + {NppRotate::call, 0, NppRotate::call, NppRotate::call}, + {0,0,0,0}, + {0,0,0,0}, + {NppRotate::call, 0, NppRotate::call, NppRotate::call} + }; - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F); + CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4); + CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC); + + dst.create(dsize, src.type()); + + funcs[src.depth()][src.channels() - 1](src, dst, dsize, angle, xShift, yShift, interpolation, StreamAccessor::getStream(stream)); } //////////////////////////////////////////////////////////////////////// @@ -857,30 +890,6 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S cudaSafeCall( cudaDeviceSynchronize() ); } -void cv::gpu::integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& s) -{ - CV_Assert(src.type() == CV_8UC1); - - int width = src.cols + 1, height = src.rows + 1; - - sum.create(height, width, CV_32S); - sqsum.create(height, width, CV_32F); - - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - cudaStream_t stream = StreamAccessor::getStream(s); - - NppStreamHandler h(stream); - - nppSafeCall( nppiSqrIntegral_8u32s32f_C1R(const_cast(src.ptr()), static_cast(src.step), - sum.ptr(), static_cast(sum.step), sqsum.ptr(), static_cast(sqsum.step), sz, 0, 0.0f, height) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} - ////////////////////////////////////////////////////////////////////////////// // sqrIntegral @@ -935,7 +944,6 @@ void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst) void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s) { -#if CUDART_VERSION > 4000 CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_64FC1); dst.create(src.size(), CV_32FC1); @@ -959,31 +967,6 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); -#else - CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_32FC1); - - dst.create(src.size(), CV_32FC1); - - NppiSize sz; - sz.width = 
src.cols; - sz.height = src.rows; - - NppiRect nppRect; - nppRect.height = rect.height; - nppRect.width = rect.width; - nppRect.x = rect.x; - nppRect.y = rect.y; - - cudaStream_t stream = StreamAccessor::getStream(s); - - NppStreamHandler h(stream); - - nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr(), static_cast(src.step), sqr.ptr(), static_cast(sqr.step), - dst.ptr(), static_cast(dst.step), sz, nppRect) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -#endif } @@ -992,25 +975,19 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons namespace { - template struct NPPTypeTraits; - template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; - typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, int* hpBufferSize); typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], int* hpBufferSize); template struct NppHistogramEvenFuncC1 { - typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NppTypeTraits::npp_t src_t; typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s * pHist, int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u * pBuffer); }; template struct NppHistogramEvenFuncC4 { - typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NppTypeTraits::npp_t src_t; typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s * pHist[4], int nLevels[4], Npp32s nLowerLevel[4], Npp32s nUpperLevel[4], Npp8u * pBuffer); @@ -1079,7 +1056,7 @@ namespace template struct NppHistogramRangeFuncC1 { - typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NppTypeTraits::npp_t src_t; typedef Npp32s level_t; enum {LEVEL_TYPE_CODE=CV_32SC1}; @@ -1097,7 +1074,7 @@ namespace }; template struct NppHistogramRangeFuncC4 { - typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NppTypeTraits::npp_t src_t; typedef Npp32s level_t; enum {LEVEL_TYPE_CODE=CV_32SC1}; diff --git a/modules/gpu/src/initialization.cpp b/modules/gpu/src/initialization.cpp index 909ab48..e30f878 100644 --- a/modules/gpu/src/initialization.cpp +++ b/modules/gpu/src/initialization.cpp @@ -171,8 +171,8 @@ bool cv::gpu::DeviceInfo::supports(cv::gpu::FeatureSet) const { throw_nogpu(); r bool cv::gpu::DeviceInfo::isCompatible() const { throw_nogpu(); return false; } void cv::gpu::DeviceInfo::query() { throw_nogpu(); } void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_nogpu(); } -void cv::gpu::printCudaDeviceInfo(int device) { throw_nogpu(); } -void cv::gpu::printShortCudaDeviceInfo(int device) { throw_nogpu(); } +void cv::gpu::printCudaDeviceInfo(int) { throw_nogpu(); } +void cv::gpu::printShortCudaDeviceInfo(int) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ @@ -346,7 +346,6 @@ void cv::gpu::printCudaDeviceInfo(int device) convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount); printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); -#if (CUDART_VERSION >= 4000) // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output int memoryClock, memBusWidth, L2CacheSize; getCudaAttribute( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); @@ -364,7 +363,7 @@ void cv::gpu::printCudaDeviceInfo(int device) printf(" Max Layered Texture Size (dim) x 
layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); -#endif + printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); @@ -375,11 +374,7 @@ void cv::gpu::printCudaDeviceInfo(int device) printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); -#if CUDART_VERSION >= 4000 printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); -#else - printf(" Concurrent copy and execution: %s\n", prop.deviceOverlap ? "Yes" : "No"); -#endif printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); @@ -388,10 +383,8 @@ void cv::gpu::printCudaDeviceInfo(int device) printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? "Yes" : "No"); printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); -#if CUDART_VERSION >= 4000 printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); -#endif printf(" Compute Mode:\n"); printf(" %s \n", computeMode[prop.computeMode]); } diff --git a/modules/gpu/src/matrix_reductions.cpp b/modules/gpu/src/matrix_reductions.cpp index 10e4e81..ac23ff1 100644 --- a/modules/gpu/src/matrix_reductions.cpp +++ b/modules/gpu/src/matrix_reductions.cpp @@ -48,6 +48,7 @@ using namespace cv::gpu; #if !defined (HAVE_CUDA) void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); } +void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&, GpuMat&) { throw_nogpu(); } double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; } double cv::gpu::norm(const GpuMat&, int, GpuMat&) { throw_nogpu(); return 0.0; } double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; } @@ -109,6 +110,12 @@ namespace void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev) { + GpuMat buf; + meanStdDev(src, mean, stddev, buf); +} + +void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat& buf) +{ CV_Assert(src.type() == CV_8UC1); NppiSize sz; @@ -117,15 +124,12 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev) DeviceBuffer dbuf(2); -#if CUDART_VERSION > 4000 int bufSize; nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) ); - GpuMat buf(1, bufSize, CV_8UC1); + ensureSizeIsEnough(1, bufSize, CV_8UC1, buf); + nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr(), static_cast(src.step), sz, buf.ptr(), dbuf, (double*)dbuf + 1) ); -#else - nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr(), static_cast(src.step), sz, dbuf, (double*)dbuf + 1) ); -#endif cudaSafeCall( cudaDeviceSynchronize() ); @@ -133,7 +137,6 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev) dbuf.download(ptrs); } - 
//////////////////////////////////////////////////////////////////////// // norm @@ -151,7 +154,7 @@ double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf) return absSum(src_single_channel, buf)[0]; if (normType == NORM_L2) - return sqrt(sqrSum(src_single_channel, buf)[0]); + return std::sqrt(sqrSum(src_single_channel, buf)[0]); if (normType == NORM_INF) { diff --git a/modules/gpu/src/optical_flow.cpp b/modules/gpu/src/optical_flow.cpp index cce7007..0921e00 100644 --- a/modules/gpu/src/optical_flow.cpp +++ b/modules/gpu/src/optical_flow.cpp @@ -228,7 +228,7 @@ void cv::gpu::createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMa minMax(u_avg, 0, &uMax); minMax(v_avg, 0, &vMax); - float max_flow = static_cast(sqrt(uMax * uMax + vMax * vMax)); + float max_flow = static_cast(std::sqrt(uMax * uMax + vMax * vMax)); CreateOpticalFlowNeedleMap_gpu(u_avg, v_avg, vertex.ptr(), colors.ptr(), max_flow, 1.0f / u.cols, 1.0f / u.rows); diff --git a/modules/gpu/src/optical_flow_farneback.cpp b/modules/gpu/src/optical_flow_farneback.cpp index 9e66222..d428c51 100644 --- a/modules/gpu/src/optical_flow_farneback.cpp +++ b/modules/gpu/src/optical_flow_farneback.cpp @@ -160,7 +160,7 @@ void cv::gpu::FarnebackOpticalFlow::setPolynomialExpansionConsts(int n, double s double ig11, ig03, ig33, ig55; prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55); - device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, ig11, ig03, ig33, ig55); + device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, static_cast(ig11), static_cast(ig03), static_cast(ig33), static_cast(ig55)); } diff --git a/modules/gpu/src/orb.cpp b/modules/gpu/src/orb.cpp index 2854404..1166999 100644 --- a/modules/gpu/src/orb.cpp +++ b/modules/gpu/src/orb.cpp @@ -429,11 +429,11 @@ void cv::gpu::ORB_GPU::setParams(size_t n_features, const ORB::CommonParams& det // pre-compute the end of a row in a circular patch int half_patch_size = params_.patch_size_ / 2; vector u_max(half_patch_size + 1); - for (int v = 0; v <= half_patch_size * sqrt(2.f) / 2 + 1; ++v) - u_max[v] = cvRound(sqrt(static_cast(half_patch_size * half_patch_size - v * v))); + for (int v = 0; v <= half_patch_size * std::sqrt(2.f) / 2 + 1; ++v) + u_max[v] = cvRound(std::sqrt(static_cast(half_patch_size * half_patch_size - v * v))); // Make sure we are symmetric - for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * sqrt(2.f) / 2; --v) + for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * std::sqrt(2.f) / 2; --v) { while (u_max[v_0] == u_max[v_0 + 1]) ++v_0; diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp index d796bc3..72d781f 100644 --- a/modules/gpu/src/precomp.hpp +++ b/modules/gpu/src/precomp.hpp @@ -91,8 +91,8 @@ #include "nvidia/NCVHaarObjectDetection.hpp" #include "nvidia/NCVBroxOpticalFlow.hpp" - #define CUDART_MINIMUM_REQUIRED_VERSION 4000 - #define NPP_MINIMUM_REQUIRED_VERSION 4000 + #define CUDART_MINIMUM_REQUIRED_VERSION 4010 + #define NPP_MINIMUM_REQUIRED_VERSION 4100 #if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION) #error "Insufficient Cuda Runtime library version, please update it." 
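The imgproc and matrix_reductions hunks above rework cv::gpu::rotate into a depth-by-channels dispatch table (CV_8U, CV_16U and CV_32F with 1, 3 or 4 channels) and add a meanStdDev overload that reuses a caller-provided scratch buffer. A minimal usage sketch of these two entry points follows; the matrix sizes, fill values and rotation parameters are arbitrary, everything else comes from the declarations changed by this patch.

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    // rotate now accepts CV_8U, CV_16U and CV_32F images with 1, 3 or 4 channels.
    cv::Mat host(480, 640, CV_8UC3, cv::Scalar(0, 255, 0));
    cv::gpu::GpuMat src(host), dst;
    cv::gpu::rotate(src, dst, src.size(), 30.0, src.cols / 4.0, src.rows / 4.0, cv::INTER_LINEAR);

    // The new meanStdDev overload keeps the NPP scratch buffer in a caller-owned
    // GpuMat, so repeated calls avoid reallocating device memory.
    cv::Mat gray(480, 640, CV_8UC1, cv::Scalar(127));
    cv::gpu::GpuMat d_gray(gray), buf;
    cv::Scalar mean, stddev;
    cv::gpu::meanStdDev(d_gray, mean, stddev, buf);
    cv::gpu::meanStdDev(d_gray, mean, stddev, buf); // second call reuses buf

    cv::Mat result;
    dst.download(result);
    return 0;
}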
diff --git a/modules/gpu/test/test_arithm.cpp b/modules/gpu/test/test_arithm.cpp index 770701e..0f57b07 100644 --- a/modules/gpu/test/test_arithm.cpp +++ b/modules/gpu/test/test_arithm.cpp @@ -69,10 +69,10 @@ PARAM_TEST_CASE(ArithmTestBase, cv::gpu::DeviceInfo, MatType, UseRoi) size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); - mat1 = randomMat(rng, size, type, 1, 16, false); - mat2 = randomMat(rng, size, type, 1, 16, false); + mat1 = randomMat(rng, size, type, 5, 16, false); + mat2 = randomMat(rng, size, type, 5, 16, false); - val = cv::Scalar(rng.uniform(0.1, 3.0), rng.uniform(0.1, 3.0), rng.uniform(0.1, 3.0), rng.uniform(0.1, 3.0)); + val = cv::Scalar(rng.uniform(1, 3), rng.uniform(1, 3), rng.uniform(1, 3), rng.uniform(1, 3)); } }; @@ -115,7 +115,8 @@ TEST_P(Add, Scalar) INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine( ALL_DEVICES, - Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1), + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4, + CV_32SC1, CV_32SC2, CV_32SC3, CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4), USE_ROI)); //////////////////////////////////////////////////////////////////////////////// @@ -157,7 +158,8 @@ TEST_P(Subtract, Scalar) INSTANTIATE_TEST_CASE_P(Arithm, Subtract, Combine( ALL_DEVICES, - Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1), + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4, + CV_32SC1, CV_32SC2, CV_32SC3, CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4), USE_ROI)); //////////////////////////////////////////////////////////////////////////////// @@ -199,7 +201,8 @@ TEST_P(Multiply, Scalar) INSTANTIATE_TEST_CASE_P(Arithm, Multiply, Combine( ALL_DEVICES, - Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1), + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC3, CV_16SC4, + CV_32SC1, CV_32SC3, CV_32FC1, CV_32FC3, CV_32FC4), USE_ROI)); //////////////////////////////////////////////////////////////////////////////// @@ -220,7 +223,7 @@ TEST_P(Divide, Array) gpuRes.download(dst); - EXPECT_MAT_NEAR(dst_gold, dst, 1.0); + EXPECT_MAT_NEAR(dst_gold, dst, mat1.depth() == CV_32F ? 1e-5 : 1); } TEST_P(Divide, Scalar) @@ -236,12 +239,13 @@ TEST_P(Divide, Scalar) gpuRes.download(dst); - EXPECT_MAT_NEAR(dst_gold, dst, 1e-5); + EXPECT_MAT_NEAR(dst_gold, dst, mat1.depth() == CV_32F ? 
1e-5 : 1); } INSTANTIATE_TEST_CASE_P(Arithm, Divide, Combine( ALL_DEVICES, - Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1), + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC3, CV_16SC4, + CV_32SC1, CV_32SC3, CV_32FC1, CV_32FC3, CV_32FC4), USE_ROI)); //////////////////////////////////////////////////////////////////////////////// @@ -313,6 +317,83 @@ INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine( USE_ROI)); //////////////////////////////////////////////////////////////////////////////// +// abs + +struct Abs : ArithmTestBase {}; + +TEST_P(Abs, Array) +{ + cv::Mat dst_gold = cv::abs(mat1); + + cv::Mat dst; + + cv::gpu::GpuMat gpuRes; + + cv::gpu::abs(loadMat(mat1, useRoi), gpuRes); + + gpuRes.download(dst); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(Arithm, Abs, Combine( + ALL_DEVICES, + Values(CV_16SC1, CV_32FC1), + USE_ROI)); + +//////////////////////////////////////////////////////////////////////////////// +// Sqr + +struct Sqr : ArithmTestBase {}; + +TEST_P(Sqr, Array) +{ + cv::Mat dst_gold; + cv::multiply(mat1, mat1, dst_gold); + + cv::Mat dst; + + cv::gpu::GpuMat gpuRes; + + cv::gpu::sqr(loadMat(mat1, useRoi), gpuRes); + + gpuRes.download(dst); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(Arithm, Sqr, Combine( + ALL_DEVICES, + Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1), + USE_ROI)); + +//////////////////////////////////////////////////////////////////////////////// +// Sqrt + +struct Sqrt : ArithmTestBase {}; + +TEST_P(Sqrt, Array) +{ + cv::Mat dst_gold; + cv::sqrt(mat1, dst_gold); + + cv::Mat dst; + + cv::gpu::GpuMat gpuRes; + + cv::gpu::sqrt(loadMat(mat1, useRoi), gpuRes); + + gpuRes.download(dst); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(Arithm, Sqrt, Combine( + ALL_DEVICES, + Values(MatType(CV_32FC1)), + USE_ROI)); + +//////////////////////////////////////////////////////////////////////////////// // compare PARAM_TEST_CASE(Compare, cv::gpu::DeviceInfo, MatType, CmpCode, UseRoi) @@ -513,7 +594,7 @@ TEST_P(Flip, Accuracy) INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine( ALL_DEVICES, - Values(CV_8UC1, CV_8UC4), + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values((int)FLIP_BOTH, (int)FLIP_X, (int)FLIP_Y), USE_ROI)); @@ -1329,6 +1410,90 @@ INSTANTIATE_TEST_CASE_P(Arithm, Bitwise, Combine( ALL_DEVICES, ALL_TYPES)); +PARAM_TEST_CASE(BitwiseScalar, cv::gpu::DeviceInfo, MatType) +{ + cv::gpu::DeviceInfo devInfo; + int type; + + cv::Size size; + cv::Mat mat; + cv::Scalar sc; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + type = GET_PARAM(1); + + cv::gpu::setDevice(devInfo.deviceID()); + + cv::RNG& rng = cvtest::TS::ptr()->get_rng(); + + size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); + + mat.create(size, type); + + for (int i = 0; i < mat.rows; ++i) + { + cv::Mat row(1, static_cast(mat.cols * mat.elemSize()), CV_8U, (void*)mat.ptr(i)); + rng.fill(row, cv::RNG::UNIFORM, cv::Scalar(0), cv::Scalar(255)); + } + + sc = cv::Scalar(rng.uniform(0, 255), rng.uniform(0, 255), rng.uniform(0, 255), rng.uniform(0, 255)); + } +}; + +TEST_P(BitwiseScalar, Or) +{ + cv::Mat dst_gold; + cv::bitwise_or(mat, sc, dst_gold); + + cv::Mat dst; + + cv::gpu::GpuMat dev_dst; + + cv::gpu::bitwise_or(loadMat(mat), sc, dev_dst); + + dev_dst.download(dst); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +TEST_P(BitwiseScalar, And) +{ + cv::Mat dst_gold; + cv::bitwise_and(mat, sc, dst_gold); + + 
cv::Mat dst; + + cv::gpu::GpuMat dev_dst; + + cv::gpu::bitwise_and(loadMat(mat), sc, dev_dst); + + dev_dst.download(dst); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +TEST_P(BitwiseScalar, Xor) +{ + cv::Mat dst_gold; + cv::bitwise_xor(mat, sc, dst_gold); + + cv::Mat dst; + + cv::gpu::GpuMat dev_dst; + + cv::gpu::bitwise_xor(loadMat(mat), sc, dev_dst); + + dev_dst.download(dst); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(Arithm, BitwiseScalar, Combine( + ALL_DEVICES, + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32SC1, CV_32SC3, CV_32SC4))); + ////////////////////////////////////////////////////////////////////////////// // addWeighted diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp index ca68224..f25cb36 100644 --- a/modules/gpu/test/test_imgproc.cpp +++ b/modules/gpu/test/test_imgproc.cpp @@ -2378,6 +2378,49 @@ INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, Combine( USE_ROI)); /////////////////////////////////////////////////////////////////////////////////////////////////////// +// swapChannels + +PARAM_TEST_CASE(SwapChannels, cv::gpu::DeviceInfo, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + bool useRoi; + + cv::Mat img; + + cv::Mat dst_gold; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + useRoi = GET_PARAM(1); + + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Mat imgBase = readImage("stereobm/aloe-L.png"); + ASSERT_FALSE(imgBase.empty()); + + cv::cvtColor(imgBase, img, cv::COLOR_BGR2BGRA); + + cv::cvtColor(img, dst_gold, cv::COLOR_BGRA2RGBA); + } +}; + +TEST_P(SwapChannels, Accuracy) +{ + cv::gpu::GpuMat gpuImage = loadMat(img, useRoi); + + const int dstOrder[] = {2, 1, 0, 3}; + cv::gpu::swapChannels(gpuImage, dstOrder); + + cv::Mat dst; + gpuImage.download(dst); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(ImgProc, SwapChannels, Combine(ALL_DEVICES, USE_ROI)); + +/////////////////////////////////////////////////////////////////////////////////////////////////////// // histograms struct HistEven : TestWithParam diff --git a/modules/gpu/test/test_main.cpp b/modules/gpu/test/test_main.cpp index ae67c53..a64d7e4 100644 --- a/modules/gpu/test/test_main.cpp +++ b/modules/gpu/test/test_main.cpp @@ -136,7 +136,7 @@ int main(int argc, char** argv) #else // HAVE_CUDA -int main(int argc, char** argv) +int main() { printf("OpenCV was built without CUDA support\n"); return 0; diff --git a/modules/highgui/src/window.cpp b/modules/highgui/src/window.cpp index 5601134..9e107d4 100644 --- a/modules/highgui/src/window.cpp +++ b/modules/highgui/src/window.cpp @@ -146,7 +146,6 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id) default: return -1; } - return -1; } void cv::namedWindow( const string& winname, int flags ) diff --git a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp index a6b8972..acd18f0 100644 --- a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp @@ -105,6 +105,8 @@ public: GraphCutSeamFinder(int cost_type = COST_COLOR_GRAD, float terminal_cost = 10000.f, float bad_region_penalty = 1000.f); + ~GraphCutSeamFinder(); + void find(const std::vector &src, const std::vector &corners, std::vector &masks); diff --git a/modules/stitching/src/seam_finders.cpp b/modules/stitching/src/seam_finders.cpp index 13a670e..2f6c78c 100644 --- a/modules/stitching/src/seam_finders.cpp +++ 
b/modules/stitching/src/seam_finders.cpp @@ -411,6 +411,8 @@ void GraphCutSeamFinder::Impl::findInPair(size_t first, size_t second, Rect roi) GraphCutSeamFinder::GraphCutSeamFinder(int cost_type, float terminal_cost, float bad_region_penalty) : impl_(new Impl(cost_type, terminal_cost, bad_region_penalty)) {} +GraphCutSeamFinder::~GraphCutSeamFinder() {} + void GraphCutSeamFinder::find(const vector &src, const vector &corners, vector &masks) diff --git a/samples/cpp/point_cloud.cpp b/samples/cpp/point_cloud.cpp index 76bb31c..694751f 100644 --- a/samples/cpp/point_cloud.cpp +++ b/samples/cpp/point_cloud.cpp @@ -247,7 +247,7 @@ inline int clamp(int val, int minVal, int maxVal) return max(min(val, maxVal), minVal); } -void PointCloudRenderer::onMouseEvent(int event, int x, int y, int flags) +void PointCloudRenderer::onMouseEvent(int event, int x, int y, int /*flags*/) { static int oldx = x; static int oldy = y; diff --git a/samples/gpu/alpha_comp.cpp b/samples/gpu/alpha_comp.cpp new file mode 100644 index 0000000..7432af9 --- /dev/null +++ b/samples/gpu/alpha_comp.cpp @@ -0,0 +1,68 @@ +#include + +#include "opencv2/core/opengl_interop.hpp" +#include "opencv2/highgui/highgui.hpp" +#include "opencv2/gpu/gpu.hpp" + +using namespace std; +using namespace cv; +using namespace cv::gpu; + +int main() +{ + cout << "This program demonstrates using alphaComp" << endl; + cout << "Press SPACE to change compositing operation" << endl; + cout << "Press ESC to exit" << endl; + + namedWindow("First Image", WINDOW_NORMAL); + namedWindow("Second Image", WINDOW_NORMAL); + namedWindow("Result", WINDOW_OPENGL); + + setGlDevice(); + + Mat src1(640, 480, CV_8UC4, Scalar::all(0)); + Mat src2(640, 480, CV_8UC4, Scalar::all(0)); + + rectangle(src1, Rect(50, 50, 200, 200), Scalar(0, 0, 255, 128), 30); + rectangle(src2, Rect(100, 100, 200, 200), Scalar(255, 0, 0, 128), 30); + + GpuMat d_src1(src1); + GpuMat d_src2(src2); + + GpuMat d_res; + + imshow("First Image", src1); + imshow("Second Image", src2); + + int alpha_op = ALPHA_OVER; + + const char* op_names[] = + { + "ALPHA_OVER", "ALPHA_IN", "ALPHA_OUT", "ALPHA_ATOP", "ALPHA_XOR", "ALPHA_PLUS", "ALPHA_OVER_PREMUL", "ALPHA_IN_PREMUL", "ALPHA_OUT_PREMUL", + "ALPHA_ATOP_PREMUL", "ALPHA_XOR_PREMUL", "ALPHA_PLUS_PREMUL", "ALPHA_PREMUL" + }; + + while (true) + { + cout << op_names[alpha_op] << endl; + + alphaComp(d_src1, d_src2, d_res, alpha_op); + + imshow("Result", d_res); + + char key = static_cast(waitKey()); + + if (key == 27) + break; + + if (key == 32) + { + ++alpha_op; + + if (alpha_op > ALPHA_PREMUL) + alpha_op = ALPHA_OVER; + } + } + + return 0; +} diff --git a/samples/gpu/brox_optical_flow.cpp b/samples/gpu/brox_optical_flow.cpp index 5b19fb5..0e956f5 100644 --- a/samples/gpu/brox_optical_flow.cpp +++ b/samples/gpu/brox_optical_flow.cpp @@ -226,7 +226,7 @@ int main(int argc, const char* argv[]) break; case 'S': - if (currentFrame < frames.size() - 1) + if (currentFrame < static_cast(frames.size()) - 1) ++currentFrame; imshow("Interpolated frame", frames[currentFrame]); diff --git a/samples/gpu/farneback_optical_flow.cpp b/samples/gpu/farneback_optical_flow.cpp index a88b2da..ddfe075 100644 --- a/samples/gpu/farneback_optical_flow.cpp +++ b/samples/gpu/farneback_optical_flow.cpp @@ -26,7 +26,7 @@ void colorizeFlow(const Mat &u, const Mat &v, Mat &dst) minMaxLoc(v, &vMin, &vMax, 0, 0); uMin = ::abs(uMin); uMax = ::abs(uMax); vMin = ::abs(vMin); vMax = ::abs(vMax); - float dMax = ::max(::max(uMin, uMax), ::max(vMin, vMax)); + float dMax = 
static_cast<float>(::max(::max(uMin, uMax), ::max(vMin, vMax)));
 
     dst.create(u.size(), CV_8UC3);
     for (int y = 0; y < u.rows; ++y)
@@ -111,11 +111,11 @@ int main(int argc, char **argv)
 
         s.str("");
         s << "opt. flow FPS: " << cvRound((getTickFrequency()/(tc1-tc0)));
-        putText(image, s.str(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255,0,255), 2.);
+        putText(image, s.str(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255,0,255), 2);
 
         s.str("");
         s << "total FPS: " << cvRound((getTickFrequency()/(t1-t0)));
-        putText(image, s.str(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255,0,255), 2.);
+        putText(image, s.str(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255,0,255), 2);
 
         imshow("flow", image);
 
diff --git a/samples/gpu/performance/performance.cpp b/samples/gpu/performance/performance.cpp
index 2d65656..e268d65 100644
--- a/samples/gpu/performance/performance.cpp
+++ b/samples/gpu/performance/performance.cpp
@@ -63,7 +63,7 @@ void TestSystem::finishCurrentSubtest()
     double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
     double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
 
-    double speedup = static_cast<double>(cpu_elapsed_) / std::max((int64)1, gpu_elapsed_);
+    double speedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_elapsed_);
     speedup_total_ += speedup;
 
     printMetrics(cpu_time, gpu_time, speedup);
diff --git a/samples/gpu/performance/performance.h b/samples/gpu/performance/performance.h
index b5b2d13..53b12a4 100644
--- a/samples/gpu/performance/performance.h
+++ b/samples/gpu/performance/performance.h
@@ -127,8 +127,10 @@ private:
     std::stringstream cur_subtest_description_;
     bool cur_subtest_is_empty_;
 
-    int64 cpu_started_, cpu_elapsed_;
-    int64 gpu_started_, gpu_elapsed_;
+    int64 cpu_started_;
+    int64 gpu_started_;
+    double cpu_elapsed_;
+    double gpu_elapsed_;
 
     double speedup_total_;
     int num_subtests_called_;
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index a9f35d5..cb51abd 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -1199,7 +1199,7 @@ TEST(FarnebackOpticalFlow)
         if (frame1.empty()) throw runtime_error("can't open " + datasets[i] + "2.png");
 
         gpu::FarnebackOpticalFlow calc;
-        calc.fastPyramids = fastPyramids;
+        calc.fastPyramids = fastPyramids != 0;
         calc.flags |= useGaussianBlur ? OPTFLOW_FARNEBACK_GAUSSIAN : 0;
 
         gpu::GpuMat d_frame0(frame0), d_frame1(frame1), d_flowx, d_flowy;
-- 
2.7.4
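The new accuracy tests exercise the added per-element entry points (cv::gpu::abs, cv::gpu::sqr, cv::gpu::sqrt) and the bitwise operations that now accept a cv::Scalar operand. A small self-contained sketch of how they are called; the matrix sizes and fill values below are arbitrary, the function names are those used by the tests.

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat host(64, 64, CV_32FC1, cv::Scalar(4.0f));
    cv::gpu::GpuMat src(host), dst;

    cv::gpu::abs(src, dst);   // per-element |src|; the tests cover CV_16S and CV_32F
    cv::gpu::sqr(src, dst);   // per-element src * src
    cv::gpu::sqrt(src, dst);  // per-element square root

    cv::Mat bytes(64, 64, CV_8UC1, cv::Scalar(0x0F));
    cv::gpu::GpuMat d_bytes(bytes), d_masked;
    cv::gpu::bitwise_or(d_bytes, cv::Scalar(0xF0), d_masked); // OR with a per-channel constant

    return 0;
}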