From 8fcef225fb89ec8047b72f068fc51065d7eb308a Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Fri, 26 Apr 2013 14:40:44 +0400
Subject: [PATCH] switched to Input/Output Array in reductions operations

---
 modules/gpu/src/cascadeclassifier.cpp           |   2 +-
 modules/gpuarithm/include/opencv2/gpuarithm.hpp | 145 ++++++++----
 modules/gpuarithm/perf/perf_arithm.cpp          |   6 +-
 modules/gpuarithm/perf/perf_reductions.cpp      |   3 +-
 modules/gpuarithm/src/arithm.cpp                | 117 ---------
 modules/gpuarithm/src/reductions.cpp            | 302 ++++++++++++++----------
 modules/gpuimgproc/src/match_template.cpp       |   2 +-
 modules/nonfree/src/surf_gpu.cpp                |   4 +-
 samples/gpu/driver_api_multi.cpp                |   2 +-
 samples/gpu/farneback_optical_flow.cpp          |   4 +-
 samples/gpu/multi.cpp                           |   2 +-
 11 files changed, 299 insertions(+), 290 deletions(-)

diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index 0f1da83..74867b4 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -458,7 +458,7 @@ public:
             // generate integral for scale
             gpu::resize(image, src, level.sFrame, 0, 0, cv::INTER_LINEAR);
-            gpu::integralBuffered(src, sint, buff);
+            gpu::integral(src, sint, buff);
 
             // calculate job
             int totalWidth = level.workArea.width / step;
diff --git a/modules/gpuarithm/include/opencv2/gpuarithm.hpp b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
index 5c51186..b131aba 100644
--- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp
+++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
@@ -209,85 +209,150 @@ inline void LUT(InputArray src, InputArray lut, OutputArray dst, Stream& stream)
 CV_EXPORTS void copyMakeBorder(InputArray src, OutputArray dst, int top, int bottom, int left, int right, int borderType,
                                Scalar value = Scalar(), Stream& stream = Stream::Null());
 
-//! implements generalized matrix product algorithm GEMM from BLAS
-CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
-    const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
-
-//! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
-CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0,
-                          int norm_type = NORM_L2, int dtype = -1, const GpuMat& mask = GpuMat());
-CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double a, double b,
-                          int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf);
-
 //! computes norm of array
 //! supports NORM_INF, NORM_L1, NORM_L2
 //! supports all matrices except 64F
-CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);
-CV_EXPORTS double norm(const GpuMat& src1, int normType, GpuMat& buf);
-CV_EXPORTS double norm(const GpuMat& src1, int normType, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS double norm(InputArray src1, int normType, InputArray mask, GpuMat& buf);
+static inline double norm(InputArray src, int normType)
+{
+    GpuMat buf;
+    return norm(src, normType, GpuMat(), buf);
+}
+static inline double norm(InputArray src, int normType, GpuMat& buf)
+{
+    return norm(src, normType, GpuMat(), buf);
+}
 
 //! computes norm of the difference between two arrays
 //! supports NORM_INF, NORM_L1, NORM_L2
 //! supports only CV_8UC1 type
-CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2);
+CV_EXPORTS double norm(InputArray src1, InputArray src2, GpuMat& buf, int normType=NORM_L2);
+static inline double norm(InputArray src1, InputArray src2, int normType=NORM_L2)
+{
+    GpuMat buf;
+    return norm(src1, src2, buf, normType);
+}
 
 //! computes sum of array elements
 //! supports only single channel images
-CV_EXPORTS Scalar sum(const GpuMat& src);
-CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS Scalar sum(InputArray src, InputArray mask, GpuMat& buf);
+static inline Scalar sum(InputArray src)
+{
+    GpuMat buf;
+    return sum(src, GpuMat(), buf);
+}
+static inline Scalar sum(InputArray src, GpuMat& buf)
+{
+    return sum(src, GpuMat(), buf);
+}
 
 //! computes sum of array elements absolute values
 //! supports only single channel images
-CV_EXPORTS Scalar absSum(const GpuMat& src);
-CV_EXPORTS Scalar absSum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS Scalar absSum(InputArray src, InputArray mask, GpuMat& buf);
+static inline Scalar absSum(InputArray src)
+{
+    GpuMat buf;
+    return absSum(src, GpuMat(), buf);
+}
+static inline Scalar absSum(InputArray src, GpuMat& buf)
+{
+    return absSum(src, GpuMat(), buf);
+}
 
 //! computes squared sum of array elements
 //! supports only single channel images
-CV_EXPORTS Scalar sqrSum(const GpuMat& src);
-CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS Scalar sqrSum(InputArray src, InputArray mask, GpuMat& buf);
+static inline Scalar sqrSum(InputArray src)
+{
+    GpuMat buf;
+    return sqrSum(src, GpuMat(), buf);
+}
+static inline Scalar sqrSum(InputArray src, GpuMat& buf)
+{
+    return sqrSum(src, GpuMat(), buf);
+}
 
 //! finds global minimum and maximum array elements and returns their values
-CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
-CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS void minMax(InputArray src, double* minVal, double* maxVal, InputArray mask, GpuMat& buf);
+static inline void minMax(InputArray src, double* minVal, double* maxVal=0, InputArray mask=noArray())
+{
+    GpuMat buf;
+    minMax(src, minVal, maxVal, mask, buf);
+}
 
 //! finds global minimum and maximum array elements and returns their values with locations
-CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
-                          const GpuMat& mask=GpuMat());
-CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
-                          const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);
+CV_EXPORTS void minMaxLoc(InputArray src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
+                          InputArray mask, GpuMat& valbuf, GpuMat& locbuf);
+static inline void minMaxLoc(InputArray src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
+                             InputArray mask=noArray())
+{
+    GpuMat valBuf, locBuf;
+    minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
+}
 
 //! counts non-zero array elements
-CV_EXPORTS int countNonZero(const GpuMat& src);
-CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
+CV_EXPORTS int countNonZero(InputArray src, GpuMat& buf);
+static inline int countNonZero(const GpuMat& src)
+{
+    GpuMat buf;
+    return countNonZero(src, buf);
+}
 
 //! reduces a matrix to a vector
-CV_EXPORTS void reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());
+CV_EXPORTS void reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());
 
 //! computes mean value and standard deviation of all or selected array elements
 //! supports only CV_8UC1 type
-CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);
-//! buffered version
-CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
+CV_EXPORTS void meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
+static inline void meanStdDev(InputArray src, Scalar& mean, Scalar& stddev)
+{
+    GpuMat buf;
+    meanStdDev(src, mean, stddev, buf);
+}
 
 //! computes the standard deviation of integral images
 //! supports only CV_32SC1 source type and CV_32FC1 sqr type
 //! output will have CV_32FC1 type
-CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null());
+CV_EXPORTS void rectStdDev(InputArray src, InputArray sqr, OutputArray dst, Rect rect, Stream& stream = Stream::Null());
+
+//! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
+CV_EXPORTS void normalize(InputArray src, OutputArray dst, double alpha, double beta,
+                          int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf);
+static inline void normalize(InputArray src, OutputArray dst, double alpha = 1, double beta = 0,
+                             int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray())
+{
+    GpuMat norm_buf;
+    GpuMat cvt_buf;
+    normalize(src, dst, alpha, beta, norm_type, dtype, mask, norm_buf, cvt_buf);
+}
 
 //! computes the integral image
 //! sum will have CV_32S type, but will contain unsigned int values
 //! supports only CV_8UC1 source type
-CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null());
-//! buffered version
-CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null());
+CV_EXPORTS void integral(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null());
+static inline void integralBuffered(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null())
+{
+    integral(src, sum, buffer, stream);
+}
+static inline void integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null())
+{
+    GpuMat buffer;
+    integral(src, sum, buffer, stream);
+}
 
 //! computes squared integral image
 //! result matrix will have 64F type, but will contain 64U values
 //! supports source images of 8UC1 type only
-CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null());
+CV_EXPORTS void sqrIntegral(InputArray src, OutputArray sqsum, GpuMat& buf, Stream& stream = Stream::Null());
+static inline void sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null())
+{
+    GpuMat buffer;
+    sqrIntegral(src, sqsum, buffer, stream);
+}
+
+//! implements generalized matrix product algorithm GEMM from BLAS
+CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
+    const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
 
 //! performs per-element multiplication of two full (not packed) Fourier spectrums
 //! supports 32FC2 matrixes only (interleaved format)
diff --git a/modules/gpuarithm/perf/perf_arithm.cpp b/modules/gpuarithm/perf/perf_arithm.cpp
index b553fc2..5f15fb4 100644
--- a/modules/gpuarithm/perf/perf_arithm.cpp
+++ b/modules/gpuarithm/perf/perf_arithm.cpp
@@ -265,7 +265,7 @@ PERF_TEST_P(Sz, Integral,
         cv::gpu::GpuMat dst;
         cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::integralBuffered(d_src, dst, d_buf);
+        TEST_CYCLE() cv::gpu::integral(d_src, dst, d_buf);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -293,9 +293,9 @@ PERF_TEST_P(Sz, IntegralSqr,
     if (PERF_RUN_GPU())
     {
         const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
+        cv::gpu::GpuMat dst, buf;
 
-        TEST_CYCLE() cv::gpu::sqrIntegral(d_src, dst);
+        TEST_CYCLE() cv::gpu::sqrIntegral(d_src, dst, buf);
 
         GPU_SANITY_CHECK(dst);
     }
diff --git a/modules/gpuarithm/perf/perf_reductions.cpp b/modules/gpuarithm/perf/perf_reductions.cpp
index 8d73180..c541ce0 100644
--- a/modules/gpuarithm/perf/perf_reductions.cpp
+++ b/modules/gpuarithm/perf/perf_reductions.cpp
@@ -108,9 +108,10 @@ PERF_TEST_P(Sz_Norm, NormDiff,
     {
         const cv::gpu::GpuMat d_src1(src1);
         const cv::gpu::GpuMat d_src2(src2);
+        cv::gpu::GpuMat d_buf;
         double gpu_dst;
 
-        TEST_CYCLE() gpu_dst = cv::gpu::norm(d_src1, d_src2, normType);
+        TEST_CYCLE() gpu_dst = cv::gpu::norm(d_src1, d_src2, d_buf, normType);
 
         SANITY_CHECK(gpu_dst);
 
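Usage sketch (illustration only, not part of the diff): the perf changes above show the intended calling pattern — the scratch buffer is created once, outside the timed cycle, and reused by every call, so the implementations only grow it (via ensureSizeIsEnough) on first use. An application-side equivalent with hypothetical names; uploadNextFrame() stands in for real input:

    cv::gpu::GpuMat d_frame;            // CV_8UC1 image, refreshed every iteration
    cv::gpu::GpuMat norm_buf, sum_buf;  // allocated on the first call, then reused
    for (int i = 0; i < 100; ++i)
    {
        uploadNextFrame(d_frame);  // hypothetical helper
        double l1 = cv::gpu::norm(d_frame, cv::NORM_L1, cv::gpu::GpuMat(), norm_buf);
        cv::Scalar s = cv::gpu::sum(d_frame, cv::gpu::GpuMat(), sum_buf);
        // an empty GpuMat as the mask means "use the whole image"
    }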
diff --git a/modules/gpuarithm/src/arithm.cpp b/modules/gpuarithm/src/arithm.cpp
index a6cd1cb..210097f 100644
--- a/modules/gpuarithm/src/arithm.cpp
+++ b/modules/gpuarithm/src/arithm.cpp
@@ -49,11 +49,6 @@ using namespace cv::gpu;
 
 void cv::gpu::gemm(const GpuMat&, const GpuMat&, double, const GpuMat&, double, GpuMat&, int, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
 void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool, Stream&) { throw_no_cuda(); }
 
 void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool, Stream&) { throw_no_cuda(); }
@@ -294,116 +289,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
 #endif
 }
 
-////////////////////////////////////////////////////////////////////////
-// integral
-
-void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s)
-{
-    GpuMat buffer;
-    gpu::integralBuffered(src, sum, buffer, s);
-}
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s)
-{
-    CV_Assert(src.type() == CV_8UC1);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    cv::Size whole;
-    cv::Point offset;
-
-    src.locateROI(whole, offset);
-
-    if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
-        && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
-    {
-        ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
-
-        cv::gpu::cudev::imgproc::shfl_integral_gpu(src, buffer, stream);
-
-        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
-
-        sum.setTo(Scalar::all(0), s);
-
-        GpuMat inner = sum(Rect(1, 1, src.cols, src.rows));
-        GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
-
-        res.copyTo(inner, s);
-    }
-    else
-    {
-#ifndef HAVE_OPENCV_GPULEGACY
-        throw_no_cuda();
-#else
-        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
-
-        NcvSize32u roiSize;
-        roiSize.width = src.cols;
-        roiSize.height = src.rows;
-
-        cudaDeviceProp prop;
-        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
-
-        Ncv32u bufSize;
-        ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
-        ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
-
-        NppStStreamHandler h(stream);
-
-        ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
-            sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-#endif
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// sqrIntegral
-
-void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
-{
-#ifndef HAVE_OPENCV_GPULEGACY
-    (void) src;
-    (void) sqsum;
-    (void) s;
-    throw_no_cuda();
-#else
-    CV_Assert(src.type() == CV_8U);
-
-    NcvSize32u roiSize;
-    roiSize.width = src.cols;
-    roiSize.height = src.rows;
-
-    cudaDeviceProp prop;
-    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
-
-    Ncv32u bufSize;
-    ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
-    GpuMat buf(1, bufSize, CV_8U);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    NppStStreamHandler h(stream);
-
-    sqsum.create(src.rows + 1, src.cols + 1, CV_64F);
-    ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
-        sqsum.ptr<Ncv64u>(0), static_cast<int>(sqsum.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-#endif
-}
-
 //////////////////////////////////////////////////////////////////////////////
 // mulSpectrums
 
@@ -650,8 +535,6 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
     (void) stream;
     throw_no_cuda();
 #else
-    using namespace cv::gpu::cudev::imgproc;
-
     CV_Assert(image.type() == CV_32F);
     CV_Assert(templ.type() == CV_32F);
 
diff --git a/modules/gpuarithm/src/reductions.cpp b/modules/gpuarithm/src/reductions.cpp
index b8b2418..248fa9a 100644
--- a/modules/gpuarithm/src/reductions.cpp
+++ b/modules/gpuarithm/src/reductions.cpp
@@ -47,41 +47,28 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-double cv::gpu::norm(const GpuMat&, int) { throw_no_cuda(); return 0.0; }
-double cv::gpu::norm(const GpuMat&, int, GpuMat&) { throw_no_cuda(); return 0.0; }
-double cv::gpu::norm(const GpuMat&, int, const GpuMat&, GpuMat&) { throw_no_cuda(); return 0.0; }
-double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_no_cuda(); return 0.0; }
+double cv::gpu::norm(InputArray, int, InputArray, GpuMat&) { throw_no_cuda(); return 0.0; }
+double cv::gpu::norm(InputArray, InputArray, GpuMat&, int) { throw_no_cuda(); return 0.0; }
 
-Scalar cv::gpu::sum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
+Scalar cv::gpu::sum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
+Scalar cv::gpu::absSum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
+Scalar cv::gpu::sqrSum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
 
-Scalar cv::gpu::absSum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::absSum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::absSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
+void cv::gpu::minMax(InputArray, double*, double*, InputArray, GpuMat&) { throw_no_cuda(); }
+void cv::gpu::minMaxLoc(InputArray, double*, double*, Point*, Point*, InputArray, GpuMat&, GpuMat&) { throw_no_cuda(); }
 
-Scalar cv::gpu::sqrSum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sqrSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
+int cv::gpu::countNonZero(InputArray, GpuMat&) { throw_no_cuda(); return 0; }
 
-void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_no_cuda(); }
+void cv::gpu::reduce(InputArray, OutputArray, int, int, int, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
+void cv::gpu::meanStdDev(InputArray, Scalar&, Scalar&, GpuMat&) { throw_no_cuda(); }
 
-int cv::gpu::countNonZero(const GpuMat&) { throw_no_cuda(); return 0; }
-int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_no_cuda(); return 0; }
+void cv::gpu::rectStdDev(InputArray, InputArray, OutputArray, Rect, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::reduce(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::normalize(InputArray, OutputArray, double, double, int, int, InputArray, GpuMat&, GpuMat&) { throw_no_cuda(); }
 
-void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_no_cuda(); }
-void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&, GpuMat&) { throw_no_cuda(); }
-
-void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
+void cv::gpu::integral(InputArray, OutputArray, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::sqrIntegral(InputArray, OutputArray, GpuMat&, Stream&) { throw_no_cuda(); }
 
 #else
 
@@ -124,21 +111,13 @@ namespace
 ////////////////////////////////////////////////////////////////////////
 // norm
 
-double cv::gpu::norm(const GpuMat& src, int normType)
-{
-    GpuMat buf;
-    return gpu::norm(src, normType, GpuMat(), buf);
-}
-
-double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf)
+double cv::gpu::norm(InputArray _src, int normType, InputArray _mask, GpuMat& buf)
 {
-    return gpu::norm(src, normType, GpuMat(), buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-double cv::gpu::norm(const GpuMat& src, int normType, const GpuMat& mask, GpuMat& buf)
-{
-    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
-    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1));
+    CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
+    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1) );
 
     GpuMat src_single_channel = src.reshape(1);
 
@@ -154,13 +133,11 @@ double cv::gpu::norm(const GpuMat& src, int normType, const GpuMat& mask, GpuMat
     return std::max(std::abs(min_val), std::abs(max_val));
 }
 
-double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
+double cv::gpu::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normType)
 {
-    CV_Assert(src1.type() == CV_8UC1);
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
-
 #if CUDA_VERSION < 5050
+    (void) buf;
+
     typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2, NppiSize oSizeROI, Npp64f* pRetVal);
 
     static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
@@ -175,13 +152,18 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
     static const buf_size_func_t buf_size_funcs[] = {nppiNormDiffInfGetBufferHostSize_8u_C1R, nppiNormDiffL1GetBufferHostSize_8u_C1R, nppiNormDiffL2GetBufferHostSize_8u_C1R};
 #endif
 
+    GpuMat src1 = _src1.getGpuMat();
+    GpuMat src2 = _src2.getGpuMat();
+
+    CV_Assert( src1.type() == CV_8UC1 );
+    CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() );
+    CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
+
     NppiSize sz;
     sz.width = src1.cols;
     sz.height = src1.rows;
 
-    int funcIdx = normType >> 1;
-
-    double retVal;
+    const int funcIdx = normType >> 1;
 
     DeviceBuffer dbuf;
 
@@ -191,13 +173,14 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
     int bufSize;
     buf_size_funcs[funcIdx](sz, &bufSize);
 
-    GpuMat buf(1, bufSize, CV_8UC1);
+    ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
 
     nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step),
         src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf, buf.data) );
 #endif
 
     cudaSafeCall( cudaDeviceSynchronize() );
 
+    double retVal;
     dbuf.download(&retVal);
 
     return retVal;
@@ -220,19 +203,11 @@ namespace sum
     void runSqr(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
 }
 
-Scalar cv::gpu::sum(const GpuMat& src)
+Scalar cv::gpu::sum(InputArray _src, InputArray _mask, GpuMat& buf)
 {
-    GpuMat buf;
-    return gpu::sum(src, GpuMat(), buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
-{
-    return gpu::sum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-{
     typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
     static const func_t funcs[7][5] =
     {
@@ -266,19 +241,11 @@ Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
 
     return Scalar(result[0], result[1], result[2], result[3]);
 }
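Usage sketch (illustration only, not part of the diff): per the header comments, the sum family expects single-channel images; an optional CV_8UC1 mask of the same size selects the pixels to include (nonzero = counted). With hypothetical names:

    cv::gpu::GpuMat d_img;   // CV_32FC1
    cv::gpu::GpuMat d_mask;  // CV_8UC1, same size as d_img
    cv::gpu::GpuMat buf;     // scratch, reused across calls
    cv::Scalar s    = cv::gpu::sum(d_img, d_mask, buf);
    cv::Scalar sAbs = cv::gpu::absSum(d_img, d_mask, buf);
    cv::Scalar sSqr = cv::gpu::sqrSum(d_img, d_mask, buf);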
 
-Scalar cv::gpu::absSum(const GpuMat& src)
-{
-    GpuMat buf;
-    return gpu::absSum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
+Scalar cv::gpu::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
 {
-    return gpu::absSum(src, GpuMat(), buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-{
     typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
     static const func_t funcs[7][5] =
     {
@@ -312,19 +279,11 @@ Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
 
     return Scalar(result[0], result[1], result[2], result[3]);
 }
 
-Scalar cv::gpu::sqrSum(const GpuMat& src)
-{
-    GpuMat buf;
-    return gpu::sqrSum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
+Scalar cv::gpu::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
 {
-    return gpu::sqrSum(src, GpuMat(), buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-{
     typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
     static const func_t funcs[7][5] =
    {
@@ -369,14 +328,11 @@ namespace minMax
     void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 }
 
-void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
+void cv::gpu::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf)
 {
-    GpuMat buf;
-    gpu::minMax(src, minVal, maxVal, mask, buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
-{
     typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
     static const func_t funcs[] =
     {
@@ -419,15 +375,12 @@ namespace minMaxLoc
     void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 }
 
-void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
+void cv::gpu::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
+                        InputArray _mask, GpuMat& valBuf, GpuMat& locBuf)
 {
-    GpuMat valBuf, locBuf;
-    gpu::minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
-                        const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
-{
     typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
     static const func_t funcs[] =
     {
@@ -472,14 +425,10 @@ namespace countNonZero
     int run(const PtrStepSzb src, PtrStep<unsigned int> buf);
 }
 
-int cv::gpu::countNonZero(const GpuMat& src)
+int cv::gpu::countNonZero(InputArray _src, GpuMat& buf)
 {
-    GpuMat buf;
-    return countNonZero(src, buf);
-}
+    GpuMat src = _src.getGpuMat();
 
-int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
-{
     typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
     static const func_t funcs[] =
     {
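Usage sketch (illustration only, not part of the diff): minMaxLoc fills whichever value/location outputs are non-null and keeps both scratch matrices caller-owned; countNonZero needs a single buffer. With hypothetical names:

    cv::gpu::GpuMat d_src;                   // single-channel input
    cv::gpu::GpuMat valBuf, locBuf, cntBuf;  // reused across calls
    double minV, maxV;
    cv::Point minL, maxL;
    cv::gpu::minMaxLoc(d_src, &minV, &maxV, &minL, &maxL,
                       cv::gpu::GpuMat(), valBuf, locBuf);  // empty mask
    int nonZero = cv::gpu::countNonZero(d_src, cntBuf);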
@@ -521,8 +470,10 @@ namespace reduce
     void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
 }
 
-void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
+void cv::gpu::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
 {
+    GpuMat src = _src.getGpuMat();
+
     CV_Assert( src.channels() <= 4 );
     CV_Assert( dim == 0 || dim == 1 );
     CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );
@@ -530,7 +481,8 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
     if (dtype < 0)
         dtype = src.depth();
 
-    dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+    _dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+    GpuMat dst = _dst.getGpuMat();
 
     if (dim == 0)
     {
@@ -691,15 +643,11 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
 ////////////////////////////////////////////////////////////////////////
 // meanStdDev
 
-void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
+void cv::gpu::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, GpuMat& buf)
 {
-    GpuMat buf;
-    meanStdDev(src, mean, stddev, buf);
-}
+    GpuMat src = _src.getGpuMat();
 
-void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat& buf)
-{
-    CV_Assert(src.type() == CV_8UC1);
+    CV_Assert( src.type() == CV_8UC1 );
 
     if (!deviceSupports(FEATURE_SET_COMPUTE_13))
         CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capability");
@@ -730,11 +678,15 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat
 //////////////////////////////////////////////////////////////////////////////
 // rectStdDev
 
-void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s)
+void cv::gpu::rectStdDev(InputArray _src, InputArray _sqr, OutputArray _dst, Rect rect, Stream& _stream)
 {
-    CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_64FC1);
+    GpuMat src = _src.getGpuMat();
+    GpuMat sqr = _sqr.getGpuMat();
+
+    CV_Assert( src.type() == CV_32SC1 && sqr.type() == CV_64FC1 );
 
-    dst.create(src.size(), CV_32FC1);
+    _dst.create(src.size(), CV_32FC1);
+    GpuMat dst = _dst.getGpuMat();
 
     NppiSize sz;
     sz.width = src.cols;
    sz.height = src.rows;
@@ -746,7 +698,7 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
     nppRect.x = rect.x;
     nppRect.y = rect.y;
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     NppStreamHandler h(stream);
 
@@ -760,16 +712,12 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
 ////////////////////////////////////////////////////////////////////////
 // normalize
 
-void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask)
+void cv::gpu::normalize(InputArray _src, OutputArray dst, double a, double b, int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf)
 {
-    GpuMat norm_buf;
-    GpuMat cvt_buf;
-    normalize(src, dst, a, b, norm_type, dtype, mask, norm_buf, cvt_buf);
-}
+    GpuMat src = _src.getGpuMat();
 
-void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
-{
     double scale = 1, shift = 0;
+
     if (norm_type == NORM_MINMAX)
     {
         double smin = 0, smax = 0;
@@ -800,4 +748,116 @@ void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int
     }
 }
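Usage sketch (illustration only, not part of the diff): with the buffered normalize overload both scratch matrices stay caller-owned, so repeated calls stop reallocating them. With hypothetical names:

    cv::gpu::GpuMat d_src, d_dst;
    cv::gpu::GpuMat norm_buf, cvt_buf;  // reused by successive calls
    // stretch the value range to [0, 255] and convert to 8-bit
    cv::gpu::normalize(d_src, d_dst, 0, 255, cv::NORM_MINMAX, CV_8U,
                       cv::gpu::GpuMat(), norm_buf, cvt_buf);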
+////////////////////////////////////////////////////////////////////////
+// integral
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
+    }
+}}}
+
+void cv::gpu::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& _stream)
+{
+    GpuMat src = _src.getGpuMat();
+
+    CV_Assert( src.type() == CV_8UC1 );
+
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+    cv::Size whole;
+    cv::Point offset;
+    src.locateROI(whole, offset);
+
+    if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
+        && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
+    {
+        ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
+
+        cv::gpu::cudev::imgproc::shfl_integral_gpu(src, buffer, stream);
+
+        _dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
+        GpuMat dst = _dst.getGpuMat();
+
+        dst.setTo(Scalar::all(0), _stream);
+
+        GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
+        GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
+
+        res.copyTo(inner, _stream);
+    }
+    else
+    {
+    #ifndef HAVE_OPENCV_GPULEGACY
+        throw_no_cuda();
+    #else
+        _dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
+        GpuMat dst = _dst.getGpuMat();
+
+        NcvSize32u roiSize;
+        roiSize.width = src.cols;
+        roiSize.height = src.rows;
+
+        cudaDeviceProp prop;
+        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+
+        Ncv32u bufSize;
+        ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
+        ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
+
+        NppStStreamHandler h(stream);
+
+        ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
+            dst.ptr<Ncv32u>(), static_cast<int>(dst.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    #endif
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// sqrIntegral
+
+void cv::gpu::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& _stream)
+{
+#ifndef HAVE_OPENCV_GPULEGACY
+    (void) _src;
+    (void) _dst;
+    (void) _stream;
+    throw_no_cuda();
+#else
+    GpuMat src = _src.getGpuMat();
+
+    CV_Assert( src.type() == CV_8U );
+
+    NcvSize32u roiSize;
+    roiSize.width = src.cols;
+    roiSize.height = src.rows;
+
+    cudaDeviceProp prop;
+    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+
+    Ncv32u bufSize;
+    ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
+
+    ensureSizeIsEnough(1, bufSize, CV_8U, buf);
+
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+    NppStStreamHandler h(stream);
+
+    _dst.create(src.rows + 1, src.cols + 1, CV_64F);
+    GpuMat dst = _dst.getGpuMat();
+
+    ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
+        dst.ptr<Ncv64u>(0), static_cast<int>(dst.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
+
+    if (stream == 0)
+        cudaSafeCall( cudaDeviceSynchronize() );
+#endif
+}
+
 #endif
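Usage sketch (illustration only, not part of the diff): integral and sqrIntegral produce exactly what rectStdDev consumes — a CV_32SC1 sum and a CV_64FC1 squared sum, each (rows + 1) x (cols + 1). An end-to-end pipeline with hypothetical names:

    cv::gpu::GpuMat d_img;                 // CV_8UC1 source
    cv::gpu::GpuMat d_sum, d_sqsum, d_dev;
    cv::gpu::GpuMat intBuf, sqrBuf;        // scratch, reused across frames
    cv::gpu::integral(d_img, d_sum, intBuf);        // CV_32SC1 integral image
    cv::gpu::sqrIntegral(d_img, d_sqsum, sqrBuf);   // CV_64FC1 squared integral
    cv::gpu::rectStdDev(d_sum, d_sqsum, d_dev, cv::Rect(0, 0, 21, 21));  // CV_32FC1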
diff --git a/modules/gpuimgproc/src/match_template.cpp b/modules/gpuimgproc/src/match_template.cpp
index 008d3da..c5375c2 100644
--- a/modules/gpuimgproc/src/match_template.cpp
+++ b/modules/gpuimgproc/src/match_template.cpp
@@ -268,7 +268,7 @@ namespace
             buf.image_sums.resize(1);
             gpu::integral(image, buf.image_sums[0], stream);
 
-            unsigned int templ_sum = (unsigned int)sum(templ)[0];
+            unsigned int templ_sum = (unsigned int)gpu::sum(templ)[0];
 
             matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, buf.image_sums[0], templ_sum, result, StreamAccessor::getStream(stream));
         }
         else
diff --git a/modules/nonfree/src/surf_gpu.cpp b/modules/nonfree/src/surf_gpu.cpp
index 82ade29..3580547 100644
--- a/modules/nonfree/src/surf_gpu.cpp
+++ b/modules/nonfree/src/surf_gpu.cpp
@@ -142,13 +142,13 @@ namespace
 
         bindImgTex(img);
 
-        gpu::integralBuffered(img, surf_.sum, surf_.intBuffer);
+        gpu::integral(img, surf_.sum, surf_.intBuffer);
         sumOffset = bindSumTex(surf_.sum);
 
         if (use_mask)
         {
             gpu::min(mask, 1.0, surf_.mask1);
-            gpu::integralBuffered(surf_.mask1, surf_.maskSum, surf_.intBuffer);
+            gpu::integral(surf_.mask1, surf_.maskSum, surf_.intBuffer);
             maskOffset = bindMaskSumTex(surf_.maskSum);
         }
     }
diff --git a/samples/gpu/driver_api_multi.cpp b/samples/gpu/driver_api_multi.cpp
index a5343df..1dfe212 100644
--- a/samples/gpu/driver_api_multi.cpp
+++ b/samples/gpu/driver_api_multi.cpp
@@ -138,7 +138,7 @@ void Worker::operator()(int device_id) const
     gpu::transpose(d_src, d_dst);
 
     // Check results
-    bool passed = norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
+    bool passed = cv::norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
     std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
         << (passed ? "passed" : "FAILED") << endl;
 
diff --git a/samples/gpu/farneback_optical_flow.cpp b/samples/gpu/farneback_optical_flow.cpp
index c93ceb0..c2a5d41 100644
--- a/samples/gpu/farneback_optical_flow.cpp
+++ b/samples/gpu/farneback_optical_flow.cpp
@@ -22,9 +22,9 @@ inline T mapVal(T x, T a, T b, T c, T d)
 static void colorizeFlow(const Mat &u, const Mat &v, Mat &dst)
 {
     double uMin, uMax;
-    minMaxLoc(u, &uMin, &uMax, 0, 0);
+    cv::minMaxLoc(u, &uMin, &uMax, 0, 0);
     double vMin, vMax;
-    minMaxLoc(v, &vMin, &vMax, 0, 0);
+    cv::minMaxLoc(v, &vMin, &vMax, 0, 0);
     uMin = ::abs(uMin); uMax = ::abs(uMax);
     vMin = ::abs(vMin); vMax = ::abs(vMax);
     float dMax = static_cast<float>(::max(::max(uMin, uMax), ::max(vMin, vMax)));
diff --git a/samples/gpu/multi.cpp b/samples/gpu/multi.cpp
index 0e9bef6..c6e6aa3 100644
--- a/samples/gpu/multi.cpp
+++ b/samples/gpu/multi.cpp
@@ -95,7 +95,7 @@ void Worker::operator()(int device_id) const
     gpu::transpose(d_src, d_dst);
 
     // Check results
-    bool passed = norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
+    bool passed = cv::norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
     std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
         << (passed ? "passed" : "FAILED") << endl;
-- 
2.7.4