From df8529377b95baa9f96159fb245c7625935925b9 Mon Sep 17 00:00:00 2001 From: Alexey Spizhevoy Date: Mon, 20 Dec 2010 09:51:25 +0000 Subject: [PATCH] refactoring: moved gpu reduction-based functions into separated file --- modules/gpu/include/opencv2/gpu/gpu.hpp | 145 +-- modules/gpu/src/arithm.cpp | 508 -------- modules/gpu/src/cuda/element_operations.cu | 123 ++ modules/gpu/src/cuda/mathfunc.cu | 1719 +--------------------------- modules/gpu/src/cuda/matrix_reductions.cu | 1609 ++++++++++++++++++++++++++ modules/gpu/src/element_operations.cpp | 152 ++- modules/gpu/src/matrix_reductions.cpp | 423 +++++++ 7 files changed, 2398 insertions(+), 2281 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 0b10293..a8c3a14 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -360,66 +360,17 @@ namespace cv friend struct StreamAccessor; }; + ////////////////////////////// Arithmetics /////////////////////////////////// //! transposes the matrix //! supports CV_8UC1, CV_8SC1, CV_8UC4, CV_8SC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32FC1 type CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst); - //! computes mean value and standard deviation of all or selected array elements - //! supports only CV_8UC1 type - CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev); - - //! computes norm of array - //! supports NORM_INF, NORM_L1, NORM_L2 - //! supports only CV_8UC1 type - CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2); - - //! computes norm of the difference between two arrays - //! supports NORM_INF, NORM_L1, NORM_L2 - //! supports only CV_8UC1 type - CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2); - //! reverses the order of the rows, columns or both in a matrix //! supports CV_8UC1, CV_8UC4 types CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode); - //! computes sum of array elements - //! supports only single channel images - CV_EXPORTS Scalar sum(const GpuMat& src); - - //! computes sum of array elements - //! supports only single channel images - CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf); - - //! computes squared sum of array elements - //! supports only single channel images - CV_EXPORTS Scalar sqrSum(const GpuMat& src); - - //! computes squared sum of array elements - //! supports only single channel images - CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf); - - //! finds global minimum and maximum array elements and returns their values - CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat()); - - //! finds global minimum and maximum array elements and returns their values - CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf); - - //! finds global minimum and maximum array elements and returns their values with locations - CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, - const GpuMat& mask=GpuMat()); - - //! finds global minimum and maximum array elements and returns their values with locations - CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, - const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf); - - //! counts non-zero array elements - CV_EXPORTS int countNonZero(const GpuMat& src); - - //! 
counts non-zero array elements - CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf); - //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i)) //! destination array will have the depth type as lut and the same channels number as source //! supports CV_8UC1, CV_8UC3 types @@ -487,25 +438,6 @@ namespace cv //! async version CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, const Stream& stream); - //! computes per-element minimum of two arrays (dst = min(src1, src2)) - CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst); - //! Async version - CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream); - - //! computes per-element minimum of array and scalar (dst = min(src1, src2)) - CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst); - //! Async version - CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream); - - //! computes per-element maximum of two arrays (dst = max(src1, src2)) - CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst); - //! Async version - CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream); - - //! computes per-element maximum of array and scalar (dst = max(src1, src2)) - CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst); - //! Async version - CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream); //////////////////////////// Per-element operations //////////////////////////////////// @@ -576,6 +508,26 @@ namespace cv //! async version CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream); + //! computes per-element minimum of two arrays (dst = min(src1, src2)) + CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst); + //! Async version + CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream); + + //! computes per-element minimum of array and scalar (dst = min(src1, src2)) + CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst); + //! Async version + CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream); + + //! computes per-element maximum of two arrays (dst = max(src1, src2)) + CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst); + //! Async version + CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream); + + //! computes per-element maximum of array and scalar (dst = max(src1, src2)) + CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst); + //! Async version + CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream); + ////////////////////////////// Image processing ////////////////////////////// @@ -663,15 +615,66 @@ namespace cv //! computes Harris cornerness criteria at each image pixel CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101); - //! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101); - //! 
computes the proximity map for the raster template and the image where the template is searched for CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method); + ////////////////////////////// Matrix reductions ////////////////////////////// + + //! computes mean value and standard deviation of all or selected array elements + //! supports only CV_8UC1 type + CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev); + + //! computes norm of array + //! supports NORM_INF, NORM_L1, NORM_L2 + //! supports only CV_8UC1 type + CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2); + + //! computes norm of the difference between two arrays + //! supports NORM_INF, NORM_L1, NORM_L2 + //! supports only CV_8UC1 type + CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2); + + //! computes sum of array elements + //! supports only single channel images + CV_EXPORTS Scalar sum(const GpuMat& src); + + //! computes sum of array elements + //! supports only single channel images + CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf); + + //! computes squared sum of array elements + //! supports only single channel images + CV_EXPORTS Scalar sqrSum(const GpuMat& src); + + //! computes squared sum of array elements + //! supports only single channel images + CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf); + + //! finds global minimum and maximum array elements and returns their values + CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat()); + + //! finds global minimum and maximum array elements and returns their values + CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf); + + //! finds global minimum and maximum array elements and returns their values with locations + CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, + const GpuMat& mask=GpuMat()); + + //! finds global minimum and maximum array elements and returns their values with locations + CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, + const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf); + + //! counts non-zero array elements + CV_EXPORTS int countNonZero(const GpuMat& src); + + //! counts non-zero array elements + CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf); + + //////////////////////////////// Filter Engine //////////////////////////////// /*! 
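The reductions collected into the new "Matrix reductions" section of gpu.hpp come in pairs: a one-shot overload that allocates its intermediate buffer internally, and an overload taking an explicit GpuMat& buf so the scratch memory can be reused across calls. A brief usage sketch, not part of the patch (the frame variable and the per-frame scenario are made up purely for illustration), of how this relocated API is driven from host code:

    #include <opencv2/gpu/gpu.hpp>

    void reduceStats(const cv::gpu::GpuMat& frame)   // a CV_8UC1 image already on the GPU
    {
        // One-shot overloads: scratch buffers are allocated internally.
        double mn = 0, mx = 0;
        cv::gpu::minMax(frame, &mn, &mx);
        int nonZero = cv::gpu::countNonZero(frame);

        // Buffer-reusing overloads: when the same reduction runs every frame,
        // keeping the scratch GpuMat alive avoids repeated allocations.
        static cv::gpu::GpuMat sumBuf, minMaxBuf;
        cv::Scalar s = cv::gpu::sum(frame, sumBuf);
        cv::gpu::minMax(frame, &mn, &mx, cv::gpu::GpuMat(), minMaxBuf);

        (void)nonZero; (void)s;
    }
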
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 56ba525..9d20a17 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -49,20 +49,7 @@ using namespace std; #if !defined (HAVE_CUDA) void cv::gpu::transpose(const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); } -double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; } -double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; } void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); } -Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); } -Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); } -Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); } -Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); } -void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); } -void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); } -void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); } -int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; } -int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; } void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&) { throw_nogpu(); } void cv::gpu::exp(const GpuMat&, GpuMat&) { throw_nogpu(); } void cv::gpu::log(const GpuMat&, GpuMat&) { throw_nogpu(); } @@ -78,14 +65,6 @@ void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, const Stream&) { throw_nogpu(); } void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); } void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, const Stream&) { throw_nogpu(); } -void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::min(const GpuMat&, double, GpuMat&) { throw_nogpu(); } -void cv::gpu::min(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); } -void cv::gpu::max(const GpuMat&, double, GpuMat&) { throw_nogpu(); } -void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ @@ -119,54 +98,6 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst) } //////////////////////////////////////////////////////////////////////// -// meanStdDev - -void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev) -{ - CV_Assert(src.type() == CV_8UC1); - - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr(), src.step, sz, mean.val, stddev.val) ); -} - -//////////////////////////////////////////////////////////////////////// -// norm - -double cv::gpu::norm(const GpuMat& src1, int normType) -{ - return norm(src1, GpuMat(src1.size(), src1.type(), Scalar::all(0.0)), normType); -} - -double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType) -{ - 
CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type()); - - CV_Assert(src1.type() == CV_8UC1); - CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2); - - typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2, - NppiSize oSizeROI, Npp64f* pRetVal); - - static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R}; - - NppiSize sz; - sz.width = src1.cols; - sz.height = src1.rows; - - int funcIdx = normType >> 1; - double retVal; - - nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr(), src1.step, - src2.ptr(), src2.step, - sz, &retVal) ); - - return retVal; -} - -//////////////////////////////////////////////////////////////////////// // flip void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode) @@ -194,305 +125,6 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode) } //////////////////////////////////////////////////////////////////////// -// sum - -namespace cv { namespace gpu { namespace mathfunc -{ - template - void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); - - template - void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); - - template - void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); - - template - void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); - - namespace sum - { - void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows); - } -}}} - -Scalar cv::gpu::sum(const GpuMat& src) -{ - GpuMat buf; - return sum(src, buf); -} - -Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf) -{ - using namespace mathfunc; - - typedef void (*Caller)(const DevMem2D, PtrStep, double*, int); - static const Caller callers[2][7] = - { { sum_multipass_caller, sum_multipass_caller, - sum_multipass_caller, sum_multipass_caller, - sum_multipass_caller, sum_multipass_caller, 0 }, - { sum_caller, sum_caller, - sum_caller, sum_caller, - sum_caller, sum_caller, 0 } }; - - Size bufSize; - sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); - buf.create(bufSize, CV_8U); - - Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()]; - if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type"); - - double result[4]; - caller(src, buf, result, src.channels()); - return Scalar(result[0], result[1], result[2], result[3]); -} - -Scalar cv::gpu::sqrSum(const GpuMat& src) -{ - GpuMat buf; - return sqrSum(src, buf); -} - -Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) -{ - using namespace mathfunc; - - typedef void (*Caller)(const DevMem2D, PtrStep, double*, int); - static const Caller callers[2][7] = - { { sqsum_multipass_caller, sqsum_multipass_caller, - sqsum_multipass_caller, sqsum_multipass_caller, - sqsum_multipass_caller, sqsum_multipass_caller, 0 }, - { sqsum_caller, sqsum_caller, - sqsum_caller, sqsum_caller, - sqsum_caller, sqsum_caller, 0 } }; - - Size bufSize; - sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); - buf.create(bufSize, CV_8U); - - Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()]; - if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type"); - - double result[4]; - caller(src, buf, result, src.channels()); - return Scalar(result[0], result[1], result[2], result[3]); -} - -//////////////////////////////////////////////////////////////////////// -// 
minMax - -namespace cv { namespace gpu { namespace mathfunc { namespace minmax { - - void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows); - - template - void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf); - - template - void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf); - - template - void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf); - - template - void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf); - -}}}} - -void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask) -{ - GpuMat buf; - minMax(src, minVal, maxVal, mask, buf); -} - -void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf) -{ - using namespace mathfunc::minmax; - - typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep); - typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep); - - static const Caller callers[2][7] = - { { min_max_multipass_caller, min_max_multipass_caller, - min_max_multipass_caller, min_max_multipass_caller, - min_max_multipass_caller, min_max_multipass_caller, 0 }, - { min_max_caller, min_max_caller, - min_max_caller, min_max_caller, - min_max_caller, min_max_caller, min_max_caller } }; - - static const MaskedCaller masked_callers[2][7] = - { { min_max_mask_multipass_caller, min_max_mask_multipass_caller, - min_max_mask_multipass_caller, min_max_mask_multipass_caller, - min_max_mask_multipass_caller, min_max_mask_multipass_caller, 0 }, - { min_max_mask_caller, min_max_mask_caller, - min_max_mask_caller, min_max_mask_caller, - min_max_mask_caller, min_max_mask_caller, - min_max_mask_caller } }; - - - CV_Assert(src.channels() == 1); - CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); - CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice())); - - double minVal_; if (!minVal) minVal = &minVal_; - double maxVal_; if (!maxVal) maxVal = &maxVal_; - - Size bufSize; - get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height); - buf.create(bufSize, CV_8U); - - if (mask.empty()) - { - Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; - if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type"); - caller(src, minVal, maxVal, buf); - } - else - { - MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()]; - if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type"); - caller(src, mask, minVal, maxVal, buf); - } -} - - -//////////////////////////////////////////////////////////////////////// -// minMaxLoc - -namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc { - - void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols, - int& b1rows, int& b2cols, int& b2rows); - - template - void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf); - - template - void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf); - - template - void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf); - - template - void 
min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf); - - -}}}} - -void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask) -{ - GpuMat valbuf, locbuf; - minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valbuf, locbuf); -} - -void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, - const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf) -{ - using namespace mathfunc::minmaxloc; - - typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - - static const Caller callers[2][7] = - { { min_max_loc_multipass_caller, min_max_loc_multipass_caller, - min_max_loc_multipass_caller, min_max_loc_multipass_caller, - min_max_loc_multipass_caller, min_max_loc_multipass_caller, 0 }, - { min_max_loc_caller, min_max_loc_caller, - min_max_loc_caller, min_max_loc_caller, - min_max_loc_caller, min_max_loc_caller, min_max_loc_caller } }; - - static const MaskedCaller masked_callers[2][7] = - { { min_max_loc_mask_multipass_caller, min_max_loc_mask_multipass_caller, - min_max_loc_mask_multipass_caller, min_max_loc_mask_multipass_caller, - min_max_loc_mask_multipass_caller, min_max_loc_mask_multipass_caller, 0 }, - { min_max_loc_mask_caller, min_max_loc_mask_caller, - min_max_loc_mask_caller, min_max_loc_mask_caller, - min_max_loc_mask_caller, min_max_loc_mask_caller, min_max_loc_mask_caller } }; - - CV_Assert(src.channels() == 1); - CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); - CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice())); - - double minVal_; if (!minVal) minVal = &minVal_; - double maxVal_; if (!maxVal) maxVal = &maxVal_; - int minLoc_[2]; - int maxLoc_[2]; - - Size valbuf_size, locbuf_size; - get_buf_size_required(src.cols, src.rows, src.elemSize(), valbuf_size.width, - valbuf_size.height, locbuf_size.width, locbuf_size.height); - valbuf.create(valbuf_size, CV_8U); - locbuf.create(locbuf_size, CV_8U); - - if (mask.empty()) - { - Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; - if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type"); - caller(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); - } - else - { - MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()]; - if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type"); - caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); - } - - if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; } - if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; } -} - -//////////////////////////////////////////////////////////////////////// -// Count non zero - -namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero { - - void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows); - - template - int count_non_zero_caller(const DevMem2D src, PtrStep buf); - - template - int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf); - -}}}} - -int cv::gpu::countNonZero(const GpuMat& src) -{ - GpuMat buf; - return countNonZero(src, buf); -} - -int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf) -{ - using namespace mathfunc::countnonzero; - - typedef int (*Caller)(const DevMem2D src, 
PtrStep buf); - - static const Caller callers[2][7] = - { { count_non_zero_multipass_caller, count_non_zero_multipass_caller, - count_non_zero_multipass_caller, count_non_zero_multipass_caller, - count_non_zero_multipass_caller, count_non_zero_multipass_caller, 0}, - { count_non_zero_caller, count_non_zero_caller, - count_non_zero_caller, count_non_zero_caller, - count_non_zero_caller, count_non_zero_caller, count_non_zero_caller } }; - - CV_Assert(src.channels() == 1); - CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice())); - - Size buf_size; - get_buf_size_required(src.cols, src.rows, buf_size.width, buf_size.height); - buf.create(buf_size, CV_8U); - - Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; - if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type"); - return caller(src, buf); -} - -//////////////////////////////////////////////////////////////////////// // LUT void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst) @@ -711,144 +343,4 @@ void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& } -////////////////////////////////////////////////////////////////////////////// -// min/max - -namespace cv { namespace gpu { namespace mathfunc -{ - template - void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); - - template - void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); - - template - void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - - template - void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); -}}} - -namespace -{ - template - void min_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream) - { - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - dst.create(src1.size(), src1.type()); - mathfunc::min_gpu(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream); - } - - template - void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream) - { - dst.create(src1.size(), src1.type()); - mathfunc::min_gpu(src1.reshape(1), src2, dst.reshape(1), stream); - } - - template - void max_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream) - { - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - dst.create(src1.size(), src1.type()); - mathfunc::max_gpu(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream); - } - - template - void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream) - { - dst.create(src1.size(), src1.type()); - mathfunc::max_gpu(src1.reshape(1), src2, dst.reshape(1), stream); - } -} - -void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) -{ - typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = - { - min_caller, min_caller, min_caller, min_caller, min_caller, - min_caller, min_caller - }; - funcs[src1.depth()](src1, src2, dst, 0); -} - -void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream) -{ - typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = - { - min_caller, min_caller, min_caller, min_caller, min_caller, - min_caller, min_caller - }; - funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); -} - -void cv::gpu::min(const GpuMat& 
src1, double src2, GpuMat& dst) -{ - typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = - { - min_caller, min_caller, min_caller, min_caller, min_caller, - min_caller, min_caller - }; - funcs[src1.depth()](src1, src2, dst, 0); -} - -void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream) -{ - typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = - { - min_caller, min_caller, min_caller, min_caller, min_caller, - min_caller, min_caller - }; - funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); -} - -void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) -{ - typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = - { - max_caller, max_caller, max_caller, max_caller, max_caller, - max_caller, max_caller - }; - funcs[src1.depth()](src1, src2, dst, 0); -} - -void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream) -{ - typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = - { - max_caller, max_caller, max_caller, max_caller, max_caller, - max_caller, max_caller - }; - funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); -} - -void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst) -{ - typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = - { - max_caller, max_caller, max_caller, max_caller, max_caller, - max_caller, max_caller - }; - funcs[src1.depth()](src1, src2, dst, 0); -} - -void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream) -{ - typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[] = - { - max_caller, max_caller, max_caller, max_caller, max_caller, - max_caller, max_caller - }; - funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); -} - - #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index ba9d011..cc44c64 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -345,4 +345,127 @@ namespace cv { namespace gpu { namespace mathfunc template void bitwiseMaskXorCaller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); template void bitwiseMaskXorCaller(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t); + + ////////////////////////////////////////////////////////////////////////// + // min/max + + struct MinOp + { + template + __device__ T operator()(T a, T b) + { + return min(a, b); + } + __device__ float operator()(float a, float b) + { + return fmin(a, b); + } + __device__ double operator()(double a, double b) + { + return fmin(a, b); + } + }; + + struct MaxOp + { + template + __device__ T operator()(T a, T b) + { + return max(a, b); + } + __device__ float operator()(float a, float b) + { + return fmax(a, b); + } + __device__ double operator()(double a, double b) + { + return fmax(a, b); + } + }; + + struct ScalarMinOp + { + double s; + + explicit ScalarMinOp(double s_) : s(s_) {} + + template + __device__ T operator()(T a) + { + return saturate_cast(fmin((double)a, s)); + } + }; + + 
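ScalarMinOp above, and ScalarMaxOp defined next, are unary functors (dst = op(src)), while MinOp/MaxOp are binary (dst = op(src1, src2)); both kinds are handed to the transform() helper from the gpu device headers, which is not shown in this patch. As a rough illustration only, a stripped-down element-wise kernel in the same spirit as the binary case might look like the following (transformSimple and its raw-pointer interface are hypothetical; the real helper also deals with streams and vectorized access):

    template <typename T, typename Op>
    __global__ void transformSimple(const T* src1, size_t step1,
                                    const T* src2, size_t step2,
                                    T* dst, size_t dstep,
                                    int rows, int cols, Op op)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < cols && y < rows)
        {
            // step values are in bytes, as with PtrStep/DevMem2D
            const T a = ((const T*)((const char*)src1 + y * step1))[x];
            const T b = ((const T*)((const char*)src2 + y * step2))[x];
            ((T*)((char*)dst + y * dstep))[x] = op(a, b);   // op is MinOp, MaxOp, ...
        }
    }
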
struct ScalarMaxOp + { + double s; + + explicit ScalarMaxOp(double s_) : s(s_) {} + + template + __device__ T operator()(T a) + { + return saturate_cast(fmax((double)a, s)); + } + }; + + template + void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream) + { + MinOp op; + transform(src1, src2, dst, op, stream); + } + + template void min_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + + template + void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream) + { + MaxOp op; + transform(src1, src2, dst, op, stream); + } + + template void max_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + + template + void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream) + { + ScalarMinOp op(src2); + transform(src1, dst, op, stream); + } + + template void min_gpu(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); + + template + void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream) + { + ScalarMaxOp op(src2); + transform(src1, dst, op, stream); + } + + template void max_gpu(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const 
DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); }}} diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu index fe5a0e2..1f8d50c 100644 --- a/modules/gpu/src/cuda/mathfunc.cu +++ b/modules/gpu/src/cuda/mathfunc.cu @@ -58,49 +58,6 @@ using namespace cv::gpu::device; namespace cv { namespace gpu { namespace mathfunc { - template - __device__ void sum_in_smem(volatile T* data, const uint tid) - { - T sum = data[tid]; - - if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); } - if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); } - if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); } - - if (tid < 32) - { - if (size >= 64) data[tid] = sum = sum + data[tid + 32]; - if (size >= 32) data[tid] = sum = sum + data[tid + 16]; - if (size >= 16) data[tid] = sum = sum + data[tid + 8]; - if (size >= 8) data[tid] = sum = sum + data[tid + 4]; - if (size >= 4) data[tid] = sum = sum + data[tid + 2]; - if (size >= 2) data[tid] = sum = sum + data[tid + 1]; - } - } - - - struct Mask8U - { - explicit Mask8U(PtrStep mask): mask(mask) {} - - __device__ bool operator()(int y, int x) const - { - return mask.ptr(y)[x]; - } - - PtrStep mask; - }; - - - struct MaskTrue - { - __device__ bool operator()(int y, int x) const - { - return true; - } - }; - - struct Nothing { static __device__ void calc(int, int, float, float, float*, size_t, float) @@ -259,1676 +216,42 @@ namespace cv { namespace gpu { namespace mathfunc } -////////////////////////////////////////////////////////////////////////////// -// Min max - - // To avoid shared bank conflicts we convert each value into value of - // appropriate type (32 bits minimum) - template struct MinMaxTypeTraits {}; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef float best_type; }; - template <> struct MinMaxTypeTraits { typedef double best_type; }; - - - namespace minmax - { - - __constant__ int ctwidth; - __constant__ int ctheight; - - // Global counter of blocks finished its work - __device__ uint blocks_finished = 0; - - - // Estimates good thread configuration - // - threads variable satisfies to threads.x * threads.y == 256 - void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid) - { - threads = dim3(32, 8); - grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); - grid.x = min(grid.x, threads.x); - grid.y = min(grid.y, threads.y); - } - - - // Returns required buffer sizes - void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows) - { - dim3 threads, grid; - estimate_thread_cfg(cols, rows, threads, grid); - bufcols = grid.x * grid.y * elem_size; - bufrows = 2; - } - - - // Estimates device constants which are used in the kernels using specified thread configuration - void set_kernel_consts(int 
cols, int rows, const dim3& threads, const dim3& grid) - { - int twidth = divUp(divUp(cols, grid.x), threads.x); - int theight = divUp(divUp(rows, grid.y), threads.y); - cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth))); - cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight))); - } - - - // Does min and max in shared memory - template - __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval) - { - minval[tid] = min(minval[tid], minval[tid + offset]); - maxval[tid] = max(maxval[tid], maxval[tid + offset]); - } - - - template - __device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const uint tid) - { - if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); } - if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); } - if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); } - - if (tid < 32) - { - if (size >= 64) merge(tid, 32, minval, maxval); - if (size >= 32) merge(tid, 16, minval, maxval); - if (size >= 16) merge(tid, 8, minval, maxval); - if (size >= 8) merge(tid, 4, minval, maxval); - if (size >= 4) merge(tid, 2, minval, maxval); - if (size >= 2) merge(tid, 1, minval, maxval); - } - } - - - template - __global__ void min_max_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval) - { - typedef typename MinMaxTypeTraits::best_type best_type; - __shared__ best_type sminval[nthreads]; - __shared__ best_type smaxval[nthreads]; - - uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - uint tid = threadIdx.y * blockDim.x + threadIdx.x; - - T mymin = numeric_limits_gpu::max(); - T mymax = numeric_limits_gpu::is_signed ? 
-numeric_limits_gpu::max() : numeric_limits_gpu::min(); - uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows); - uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols); - for (uint y = y0; y < y_end; y += blockDim.y) - { - const T* src_row = (const T*)src.ptr(y); - for (uint x = x0; x < x_end; x += blockDim.x) - { - T val = src_row[x]; - if (mask(y, x)) - { - mymin = min(mymin, val); - mymax = max(mymax, val); - } - } - } - - sminval[tid] = mymin; - smaxval[tid] = mymax; - __syncthreads(); - - find_min_max_in_smem(sminval, smaxval, tid); - - if (tid == 0) - { - minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; - maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; - } - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 - __shared__ bool is_last; - - if (tid == 0) - { - minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; - maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; - __threadfence(); - - uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = ticket == gridDim.x * gridDim.y - 1; - } - - __syncthreads(); - - if (is_last) - { - uint idx = min(tid, gridDim.x * gridDim.y - 1); - - sminval[tid] = minval[idx]; - smaxval[tid] = maxval[idx]; - __syncthreads(); - - find_min_max_in_smem(sminval, smaxval, tid); - - if (tid == 0) - { - minval[0] = (T)sminval[0]; - maxval[0] = (T)smaxval[0]; - blocks_finished = 0; - } - } -#else - if (tid == 0) - { - minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; - maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; - } -#endif - } - - - template - void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf) - { - dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)buf.ptr(0); - T* maxval_buf = (T*)buf.ptr(1); - - min_max_kernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf); - cudaSafeCall(cudaThreadSynchronize()); - - T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - *minval = minval_; - *maxval = maxval_; - } - - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - - - template - void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf) - { - dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)buf.ptr(0); - T* maxval_buf = (T*)buf.ptr(1); - - min_max_kernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf); - cudaSafeCall(cudaThreadSynchronize()); - - T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, 
sizeof(T), cudaMemcpyDeviceToHost)); - *minval = minval_; - *maxval = maxval_; - } - - template void min_max_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_caller(const DevMem2D, double*,double*, PtrStep); - template void min_max_caller(const DevMem2D, double*, double*, PtrStep); - - - template - __global__ void min_max_pass2_kernel(T* minval, T* maxval, int size) - { - typedef typename MinMaxTypeTraits::best_type best_type; - __shared__ best_type sminval[nthreads]; - __shared__ best_type smaxval[nthreads]; - - uint tid = threadIdx.y * blockDim.x + threadIdx.x; - uint idx = min(tid, gridDim.x * gridDim.y - 1); - - sminval[tid] = minval[idx]; - smaxval[tid] = maxval[idx]; - __syncthreads(); - - find_min_max_in_smem(sminval, smaxval, tid); - - if (tid == 0) - { - minval[0] = (T)sminval[0]; - maxval[0] = (T)smaxval[0]; - } - } - - - template - void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf) - { - dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)buf.ptr(0); - T* maxval_buf = (T*)buf.ptr(1); - - min_max_kernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf); - min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y); - cudaSafeCall(cudaThreadSynchronize()); - - T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - *minval = minval_; - *maxval = maxval_; - } - - template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - - - template - void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf) - { - dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)buf.ptr(0); - T* maxval_buf = (T*)buf.ptr(1); - - min_max_kernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf); - min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y); - cudaSafeCall(cudaThreadSynchronize()); - - T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - *minval = minval_; - *maxval = maxval_; - } - - template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); - template void 
min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); - - } // namespace minmax - -/////////////////////////////////////////////////////////////////////////////// -// minMaxLoc - - namespace minmaxloc { - - __constant__ int ctwidth; - __constant__ int ctheight; - - // Global counter of blocks finished its work - __device__ uint blocks_finished = 0; - - - // Estimates good thread configuration - // - threads variable satisfies to threads.x * threads.y == 256 - void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid) - { - threads = dim3(32, 8); - grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); - grid.x = min(grid.x, threads.x); - grid.y = min(grid.y, threads.y); - } - - - // Returns required buffer sizes - void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols, - int& b1rows, int& b2cols, int& b2rows) - { - dim3 threads, grid; - estimate_thread_cfg(cols, rows, threads, grid); - b1cols = grid.x * grid.y * elem_size; // For values - b1rows = 2; - b2cols = grid.x * grid.y * sizeof(int); // For locations - b2rows = 2; - } - - - // Estimates device constants which are used in the kernels using specified thread configuration - void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) - { - int twidth = divUp(divUp(cols, grid.x), threads.x); - int theight = divUp(divUp(rows, grid.y), threads.y); - cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth))); - cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight))); - } - - - template - __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval, - volatile uint* minloc, volatile uint* maxloc) - { - T val = minval[tid + offset]; - if (val < minval[tid]) - { - minval[tid] = val; - minloc[tid] = minloc[tid + offset]; - } - val = maxval[tid + offset]; - if (val > maxval[tid]) - { - maxval[tid] = val; - maxloc[tid] = maxloc[tid + offset]; - } - } - - - template - __device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile uint* minloc, - volatile uint* maxloc, const uint tid) - { - if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); } - if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); } - if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); } - - if (tid < 32) - { - if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc); - if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc); - if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc); - if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc); - if (size >= 4) merge(tid, 2, minval, maxval, minloc, maxloc); - if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc); - } - } - - - template - __global__ void min_max_loc_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval, - uint* minloc, uint* maxloc) - { - typedef typename MinMaxTypeTraits::best_type best_type; - __shared__ best_type sminval[nthreads]; - __shared__ best_type smaxval[nthreads]; - __shared__ uint sminloc[nthreads]; - __shared__ uint smaxloc[nthreads]; - - uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - uint y0 = 
blockIdx.y * blockDim.y * ctheight + threadIdx.y; - uint tid = threadIdx.y * blockDim.x + threadIdx.x; - - T mymin = numeric_limits_gpu::max(); - T mymax = numeric_limits_gpu::is_signed ? -numeric_limits_gpu::max() : numeric_limits_gpu::min(); - uint myminloc = 0; - uint mymaxloc = 0; - uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows); - uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols); - - for (uint y = y0; y < y_end; y += blockDim.y) - { - const T* ptr = (const T*)src.ptr(y); - for (uint x = x0; x < x_end; x += blockDim.x) - { - if (mask(y, x)) - { - T val = ptr[x]; - if (val <= mymin) { mymin = val; myminloc = y * src.cols + x; } - if (val >= mymax) { mymax = val; mymaxloc = y * src.cols + x; } - } - } - } - - sminval[tid] = mymin; - smaxval[tid] = mymax; - sminloc[tid] = myminloc; - smaxloc[tid] = mymaxloc; - __syncthreads(); - - find_min_max_loc_in_smem(sminval, smaxval, sminloc, smaxloc, tid); - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 - __shared__ bool is_last; - - if (tid == 0) - { - minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; - maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; - minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0]; - maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0]; - __threadfence(); - - uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = ticket == gridDim.x * gridDim.y - 1; - } - - __syncthreads(); - - if (is_last) - { - uint idx = min(tid, gridDim.x * gridDim.y - 1); - - sminval[tid] = minval[idx]; - smaxval[tid] = maxval[idx]; - sminloc[tid] = minloc[idx]; - smaxloc[tid] = maxloc[idx]; - __syncthreads(); - - find_min_max_loc_in_smem(sminval, smaxval, sminloc, smaxloc, tid); - - if (tid == 0) - { - minval[0] = (T)sminval[0]; - maxval[0] = (T)smaxval[0]; - minloc[0] = sminloc[0]; - maxloc[0] = smaxloc[0]; - blocks_finished = 0; - } - } -#else - if (tid == 0) - { - minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; - maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; - minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0]; - maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0]; - } -#endif - } - - - template - void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) - { - dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)valbuf.ptr(0); - T* maxval_buf = (T*)valbuf.ptr(1); - uint* minloc_buf = (uint*)locbuf.ptr(0); - uint* maxloc_buf = (uint*)locbuf.ptr(1); - - min_max_loc_kernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf); - cudaSafeCall(cudaThreadSynchronize()); - - T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - *minval = minval_; - *maxval = maxval_; - - uint minloc_, maxloc_; - cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; - maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; - } - - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - 
template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - - - template - void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) - { - dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)valbuf.ptr(0); - T* maxval_buf = (T*)valbuf.ptr(1); - uint* minloc_buf = (uint*)locbuf.ptr(0); - uint* maxloc_buf = (uint*)locbuf.ptr(1); - - min_max_loc_kernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf); - cudaSafeCall(cudaThreadSynchronize()); - - T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - *minval = minval_; - *maxval = maxval_; - - uint minloc_, maxloc_; - cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; - maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; - } - - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - - - // This kernel will be used only when compute capability is 1.0 - template - __global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size) - { - typedef typename MinMaxTypeTraits::best_type best_type; - __shared__ best_type sminval[nthreads]; - __shared__ best_type smaxval[nthreads]; - __shared__ uint sminloc[nthreads]; - __shared__ uint smaxloc[nthreads]; - - uint tid = threadIdx.y * blockDim.x + threadIdx.x; - uint idx = min(tid, gridDim.x * gridDim.y - 1); - - sminval[tid] = minval[idx]; - smaxval[tid] = maxval[idx]; - sminloc[tid] = minloc[idx]; - smaxloc[tid] = maxloc[idx]; - __syncthreads(); - - find_min_max_loc_in_smem(sminval, smaxval, sminloc, smaxloc, tid); - - if (tid == 0) - { - minval[0] = (T)sminval[0]; - maxval[0] = (T)smaxval[0]; - minloc[0] = sminloc[0]; - maxloc[0] = smaxloc[0]; - } - } - - - 
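The pass2 kernel above exists because the single-pass kernels rely on global atomics (the atomicInc on blocks_finished plus __threadfence) to let the last block that finishes fold all per-block partial results, and compute capability 1.0 devices lack those atomics. On such devices the multipass callers below launch the main kernel, which leaves one partial result per block in the caller-supplied buffers, and then this single-block pass2 kernel folds them (the thread configuration clamps the grid to at most 32 x 8 = 256 blocks, so one 256-thread block suffices). A simplified, self-contained sketch of that two-pass pattern, using a plain sum and hypothetical names rather than the min/max/location tuple:

    // Pass 1: one partial sum per block, launched with 256 threads per block.
    __global__ void partialSums(const int* data, int n, int* partials)
    {
        __shared__ int smem[256];
        const int tid = threadIdx.x;

        int s = 0;
        for (int i = blockIdx.x * blockDim.x + tid; i < n; i += gridDim.x * blockDim.x)
            s += data[i];
        smem[tid] = s;
        __syncthreads();

        // Tree reduction in shared memory.
        for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
        {
            if (tid < offset) smem[tid] += smem[tid + offset];
            __syncthreads();
        }
        if (tid == 0) partials[blockIdx.x] = smem[0];
    }

    // Pass 2: a single 256-thread block folds the partials (assumes numPartials <= 256).
    __global__ void finalSum(int* partials, int numPartials)
    {
        __shared__ int smem[256];
        const int tid = threadIdx.x;
        smem[tid] = tid < numPartials ? partials[tid] : 0;
        __syncthreads();

        for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
        {
            if (tid < offset) smem[tid] += smem[tid + offset];
            __syncthreads();
        }
        if (tid == 0) partials[0] = smem[0];
    }

This is also why get_buf_size_required reports grid.x * grid.y elements per buffer row: the caller-provided buf/valbuf/locbuf double as the storage for those per-block partials.
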
template - void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) - { - dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)valbuf.ptr(0); - T* maxval_buf = (T*)valbuf.ptr(1); - uint* minloc_buf = (uint*)locbuf.ptr(0); - uint* maxloc_buf = (uint*)locbuf.ptr(1); - - min_max_loc_kernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf); - min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); - cudaSafeCall(cudaThreadSynchronize()); - - T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - *minval = minval_; - *maxval = maxval_; - - uint minloc_, maxloc_; - cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; - maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; - } - - template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - - - template - void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) - { - dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)valbuf.ptr(0); - T* maxval_buf = (T*)valbuf.ptr(1); - uint* minloc_buf = (uint*)locbuf.ptr(0); - uint* maxloc_buf = (uint*)locbuf.ptr(1); - - min_max_loc_kernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf); - min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); - cudaSafeCall(cudaThreadSynchronize()); - - T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - *minval = minval_; - *maxval = maxval_; - - uint minloc_, maxloc_; - cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; - maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; - } - - template void min_max_loc_multipass_caller(const DevMem2D, 
double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - - } // namespace minmaxloc - ////////////////////////////////////////////////////////////////////////////////////////////////////////// -// countNonZero +// transpose - namespace countnonzero + __global__ void transpose(const DevMem2Di src, PtrStepi dst) { + __shared__ int s_mem[16 * 17]; - __constant__ int ctwidth; - __constant__ int ctheight; - - __device__ uint blocks_finished = 0; - - void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid) - { - threads = dim3(32, 8); - grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); - grid.x = min(grid.x, threads.x); - grid.y = min(grid.y, threads.y); - } + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int smem_idx = threadIdx.y * blockDim.x + threadIdx.x + threadIdx.y; + if (y < src.rows && x < src.cols) + { + s_mem[smem_idx] = src.ptr(y)[x]; + } + __syncthreads(); - void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows) - { - dim3 threads, grid; - estimate_thread_cfg(cols, rows, threads, grid); - bufcols = grid.x * grid.y * sizeof(int); - bufrows = 1; - } + smem_idx = threadIdx.x * blockDim.x + threadIdx.y + threadIdx.x; + x = blockIdx.y * blockDim.x + threadIdx.x; + y = blockIdx.x * blockDim.y + threadIdx.y; - void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) - { - int twidth = divUp(divUp(cols, grid.x), threads.x); - int theight = divUp(divUp(rows, grid.y), threads.y); - cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); - cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); + if (y < src.cols && x < src.rows) + { + dst.ptr(y)[x] = s_mem[smem_idx]; + } } - - template - __global__ void count_non_zero_kernel(const DevMem2D src, volatile uint* count) + void transpose_gpu(const DevMem2Di& src, const DevMem2Di& dst) { - __shared__ uint scount[nthreads]; - - uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - uint tid = threadIdx.y * blockDim.x + threadIdx.x; - - uint cnt = 0; - for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) - { - const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y); - for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) - cnt += ptr[x0 + x * blockDim.x] != 0; - } - - scount[tid] = cnt; - __syncthreads(); - - sum_in_smem(scount, tid); - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 - __shared__ bool is_last; - - if (tid == 0) - { - count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0]; - __threadfence(); - - uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = ticket == gridDim.x * gridDim.y - 1; - } - - __syncthreads(); - - if (is_last) - { - scount[tid] = tid < gridDim.x * gridDim.y ? 
count[tid] : 0; - __syncthreads(); - - sum_in_smem(scount, tid); - - if (tid == 0) - { - count[0] = scount[0]; - blocks_finished = 0; - } - } -#else - if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0]; -#endif - } - - - template - int count_non_zero_caller(const DevMem2D src, PtrStep buf) - { - dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); - - uint* count_buf = (uint*)buf.ptr(0); - - count_non_zero_kernel<256, T><<>>(src, count_buf); - cudaSafeCall(cudaThreadSynchronize()); - - uint count; - cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost)); - - return count; - } - - template int count_non_zero_caller(const DevMem2D, PtrStep); - template int count_non_zero_caller(const DevMem2D, PtrStep); - template int count_non_zero_caller(const DevMem2D, PtrStep); - template int count_non_zero_caller(const DevMem2D, PtrStep); - template int count_non_zero_caller(const DevMem2D, PtrStep); - template int count_non_zero_caller(const DevMem2D, PtrStep); - template int count_non_zero_caller(const DevMem2D, PtrStep); - - - template - __global__ void count_non_zero_pass2_kernel(uint* count, int size) - { - __shared__ uint scount[nthreads]; - uint tid = threadIdx.y * blockDim.x + threadIdx.x; - - scount[tid] = tid < size ? count[tid] : 0; - __syncthreads(); - - sum_in_smem(scount, tid); - - if (tid == 0) - count[0] = scount[0]; - } - - - template - int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf) - { - dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); - - uint* count_buf = (uint*)buf.ptr(0); - - count_non_zero_kernel<256, T><<>>(src, count_buf); - count_non_zero_pass2_kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y); - cudaSafeCall(cudaThreadSynchronize()); - - uint count; - cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost)); - - return count; - } - - template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); - template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); - template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); - template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); - template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); - template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); - - } // namespace countnonzero - -////////////////////////////////////////////////////////////////////////////////////////////////////////// -// transpose - - __global__ void transpose(const DevMem2Di src, PtrStepi dst) - { - __shared__ int s_mem[16 * 17]; - - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; - int smem_idx = threadIdx.y * blockDim.x + threadIdx.x + threadIdx.y; - - if (y < src.rows && x < src.cols) - { - s_mem[smem_idx] = src.ptr(y)[x]; - } - __syncthreads(); - - smem_idx = threadIdx.x * blockDim.x + threadIdx.y + threadIdx.x; - - x = blockIdx.y * blockDim.x + threadIdx.x; - y = blockIdx.x * blockDim.y + threadIdx.y; - - if (y < src.cols && x < src.rows) - { - dst.ptr(y)[x] = s_mem[smem_idx]; - } - } - - void transpose_gpu(const DevMem2Di& src, const DevMem2Di& dst) - { - dim3 threads(16, 16, 1); - dim3 grid(divUp(src.cols, 16), divUp(src.rows, 16), 1); + dim3 threads(16, 16, 1); + dim3 grid(divUp(src.cols, 16), divUp(src.rows, 16), 1); transpose<<>>(src, dst); cudaSafeCall( cudaThreadSynchronize() 
); } - -////////////////////////////////////////////////////////////////////////////////////////////////////////// -// min/max - - struct MinOp - { - template - __device__ T operator()(T a, T b) - { - return min(a, b); - } - __device__ float operator()(float a, float b) - { - return fmin(a, b); - } - __device__ double operator()(double a, double b) - { - return fmin(a, b); - } - }; - - struct MaxOp - { - template - __device__ T operator()(T a, T b) - { - return max(a, b); - } - __device__ float operator()(float a, float b) - { - return fmax(a, b); - } - __device__ double operator()(double a, double b) - { - return fmax(a, b); - } - }; - - struct ScalarMinOp - { - double s; - - explicit ScalarMinOp(double s_) : s(s_) {} - - template - __device__ T operator()(T a) - { - return saturate_cast(fmin((double)a, s)); - } - }; - - struct ScalarMaxOp - { - double s; - - explicit ScalarMaxOp(double s_) : s(s_) {} - - template - __device__ T operator()(T a) - { - return saturate_cast(fmax((double)a, s)); - } - }; - - template - void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream) - { - MinOp op; - transform(src1, src2, dst, op, stream); - } - - template void min_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); - - template - void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream) - { - MaxOp op; - transform(src1, src2, dst, op, stream); - } - - template void max_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); - - template - void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream) - { - ScalarMinOp op(src2); - transform(src1, dst, op, stream); - } - - template void min_gpu(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t 
stream); - template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - - template - void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream) - { - ScalarMaxOp op(src2); - transform(src1, dst, op, stream); - } - - template void max_gpu(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - -////////////////////////////////////////////////////////////////////////////// -// Sum - - namespace sum - { - - template struct SumType {}; - template <> struct SumType { typedef uint R; }; - template <> struct SumType { typedef int R; }; - template <> struct SumType { typedef uint R; }; - template <> struct SumType { typedef int R; }; - template <> struct SumType { typedef int R; }; - template <> struct SumType { typedef float R; }; - template <> struct SumType { typedef double R; }; - - template - struct IdentityOp { static __device__ R call(R x) { return x; } }; - - template - struct SqrOp { static __device__ R call(R x) { return x * x; } }; - - __constant__ int ctwidth; - __constant__ int ctheight; - __device__ uint blocks_finished = 0; - - const int threads_x = 32; - const int threads_y = 8; - - void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid) - { - threads = dim3(threads_x, threads_y); - grid = dim3(divUp(cols, threads.x * threads.y), - divUp(rows, threads.y * threads.x)); - grid.x = min(grid.x, threads.x); - grid.y = min(grid.y, threads.y); - } - - - void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows) - { - dim3 threads, grid; - estimate_thread_cfg(cols, rows, threads, grid); - bufcols = grid.x * grid.y * sizeof(double) * cn; - bufrows = 1; - } - - - void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) - { - int twidth = divUp(divUp(cols, grid.x), threads.x); - int theight = divUp(divUp(rows, grid.y), threads.y); - cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); - cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); - } - - template - __global__ void sum_kernel(const DevMem2D src, R* result) - { - __shared__ R smem[nthreads]; - - const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - const int bid = blockIdx.y * gridDim.x + blockIdx.x; - - R sum = 0; - for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) - { - const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y); - for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) - sum += Op::call(ptr[x0 + x * blockDim.x]); - } - - smem[tid] = sum; - 
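// Each thread has now written its per-thread partial sum into shared memory.
// What follows: the block reduces those partials with sum_in_smem, and on compute
// capability >= 1.1 the atomicInc "ticket" on blocks_finished identifies the last
// block to finish, which then folds all per-block results so the whole reduction
// completes in a single kernel launch; on 1.0 devices only per-block results are
// written and the pass2 kernel finishes the job.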
__syncthreads(); - - sum_in_smem(smem, tid); - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 - __shared__ bool is_last; - - if (tid == 0) - { - result[bid] = smem[0]; - __threadfence(); - - uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = (ticket == gridDim.x * gridDim.y - 1); - } - - __syncthreads(); - - if (is_last) - { - smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0; - __syncthreads(); - - sum_in_smem(smem, tid); - - if (tid == 0) - { - result[0] = smem[0]; - blocks_finished = 0; - } - } -#else - if (tid == 0) result[bid] = smem[0]; -#endif - } - - - template - __global__ void sum_pass2_kernel(R* result, int size) - { - __shared__ R smem[nthreads]; - int tid = threadIdx.y * blockDim.x + threadIdx.x; - - smem[tid] = tid < size ? result[tid] : 0; - __syncthreads(); - - sum_in_smem(smem, tid); - - if (tid == 0) - result[0] = smem[0]; - } - - - template - __global__ void sum_kernel_C2(const DevMem2D src, typename TypeVec::vec_t* result) - { - typedef typename TypeVec::vec_t SrcType; - typedef typename TypeVec::vec_t DstType; - - __shared__ R smem[nthreads * 2]; - - const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - const int bid = blockIdx.y * gridDim.x + blockIdx.x; - - SrcType val; - DstType sum = VecTraits::all(0); - for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) - { - const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y); - for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) - { - val = ptr[x0 + x * blockDim.x]; - sum = sum + VecTraits::make(Op::call(val.x), Op::call(val.y)); - } - } - - smem[tid] = sum.x; - smem[tid + nthreads] = sum.y; - __syncthreads(); - - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 - __shared__ bool is_last; - - if (tid == 0) - { - DstType res; - res.x = smem[0]; - res.y = smem[nthreads]; - result[bid] = res; - __threadfence(); - - uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = (ticket == gridDim.x * gridDim.y - 1); - } - - __syncthreads(); - - if (is_last) - { - DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); - smem[tid] = res.x; - smem[tid + nthreads] = res.y; - __syncthreads(); - - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); - - if (tid == 0) - { - res.x = smem[0]; - res.y = smem[nthreads]; - result[0] = res; - blocks_finished = 0; - } - } -#else - if (tid == 0) - { - DstType res; - res.x = smem[0]; - res.y = smem[nthreads]; - result[bid] = res; - } -#endif - } - - - template - __global__ void sum_pass2_kernel_C2(typename TypeVec::vec_t* result, int size) - { - typedef typename TypeVec::vec_t DstType; - - __shared__ R smem[nthreads * 2]; - - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - - DstType res = tid < gridDim.x * gridDim.y ? 
result[tid] : VecTraits::all(0); - smem[tid] = res.x; - smem[tid + nthreads] = res.y; - __syncthreads(); - - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); - - if (tid == 0) - { - res.x = smem[0]; - res.y = smem[nthreads]; - result[0] = res; - } - } - - - template - __global__ void sum_kernel_C3(const DevMem2D src, typename TypeVec::vec_t* result) - { - typedef typename TypeVec::vec_t SrcType; - typedef typename TypeVec::vec_t DstType; - - __shared__ R smem[nthreads * 3]; - - const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - const int bid = blockIdx.y * gridDim.x + blockIdx.x; - - SrcType val; - DstType sum = VecTraits::all(0); - for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) - { - const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y); - for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) - { - val = ptr[x0 + x * blockDim.x]; - sum = sum + VecTraits::make(Op::call(val.x), Op::call(val.y), Op::call(val.z)); - } - } - - smem[tid] = sum.x; - smem[tid + nthreads] = sum.y; - smem[tid + 2 * nthreads] = sum.z; - __syncthreads(); - - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); - sum_in_smem(smem + 2 * nthreads, tid); - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 - __shared__ bool is_last; - - if (tid == 0) - { - DstType res; - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - result[bid] = res; - __threadfence(); - - uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = (ticket == gridDim.x * gridDim.y - 1); - } - - __syncthreads(); - - if (is_last) - { - DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); - smem[tid] = res.x; - smem[tid + nthreads] = res.y; - smem[tid + 2 * nthreads] = res.z; - __syncthreads(); - - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); - sum_in_smem(smem + 2 * nthreads, tid); - - if (tid == 0) - { - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - result[0] = res; - blocks_finished = 0; - } - } -#else - if (tid == 0) - { - DstType res; - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - result[bid] = res; - } -#endif - } - - - template - __global__ void sum_pass2_kernel_C3(typename TypeVec::vec_t* result, int size) - { - typedef typename TypeVec::vec_t DstType; - - __shared__ R smem[nthreads * 3]; - - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - - DstType res = tid < gridDim.x * gridDim.y ? 
result[tid] : VecTraits::all(0); - smem[tid] = res.x; - smem[tid + nthreads] = res.y; - smem[tid + 2 * nthreads] = res.z; - __syncthreads(); - - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); - sum_in_smem(smem + 2 * nthreads, tid); - - if (tid == 0) - { - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - result[0] = res; - } - } - - template - __global__ void sum_kernel_C4(const DevMem2D src, typename TypeVec::vec_t* result) - { - typedef typename TypeVec::vec_t SrcType; - typedef typename TypeVec::vec_t DstType; - - __shared__ R smem[nthreads * 4]; - - const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - const int bid = blockIdx.y * gridDim.x + blockIdx.x; - - SrcType val; - DstType sum = VecTraits::all(0); - for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) - { - const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y); - for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) - { - val = ptr[x0 + x * blockDim.x]; - sum = sum + VecTraits::make(Op::call(val.x), Op::call(val.y), - Op::call(val.z), Op::call(val.w)); - } - } - - smem[tid] = sum.x; - smem[tid + nthreads] = sum.y; - smem[tid + 2 * nthreads] = sum.z; - smem[tid + 3 * nthreads] = sum.w; - __syncthreads(); - - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); - sum_in_smem(smem + 2 * nthreads, tid); - sum_in_smem(smem + 3 * nthreads, tid); - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 - __shared__ bool is_last; - - if (tid == 0) - { - DstType res; - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - res.w = smem[3 * nthreads]; - result[bid] = res; - __threadfence(); - - uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = (ticket == gridDim.x * gridDim.y - 1); - } - - __syncthreads(); - - if (is_last) - { - DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); - smem[tid] = res.x; - smem[tid + nthreads] = res.y; - smem[tid + 2 * nthreads] = res.z; - smem[tid + 3 * nthreads] = res.w; - __syncthreads(); - - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); - sum_in_smem(smem + 2 * nthreads, tid); - sum_in_smem(smem + 3 * nthreads, tid); - - if (tid == 0) - { - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - res.w = smem[3 * nthreads]; - result[0] = res; - blocks_finished = 0; - } - } -#else - if (tid == 0) - { - DstType res; - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - res.w = smem[3 * nthreads]; - result[bid] = res; - } -#endif - } - - - template - __global__ void sum_pass2_kernel_C4(typename TypeVec::vec_t* result, int size) - { - typedef typename TypeVec::vec_t DstType; - - __shared__ R smem[nthreads * 4]; - - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - - DstType res = tid < gridDim.x * gridDim.y ? 
result[tid] : VecTraits::all(0); - smem[tid] = res.x; - smem[tid + nthreads] = res.y; - smem[tid + 2 * nthreads] = res.z; - smem[tid + 3 * nthreads] = res.z; - __syncthreads(); - - sum_in_smem(smem, tid); - sum_in_smem(smem + nthreads, tid); - sum_in_smem(smem + 2 * nthreads, tid); - sum_in_smem(smem + 3 * nthreads, tid); - - if (tid == 0) - { - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - res.w = smem[3 * nthreads]; - result[0] = res; - } - } - - } // namespace sum - - - template - void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) - { - using namespace sum; - typedef typename SumType::R R; - - dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); - - switch (cn) - { - case 1: - sum_kernel, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); - case 2: - sum_kernel_C2, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel_C2<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); - case 3: - sum_kernel_C3, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel_C3<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); - case 4: - sum_kernel_C4, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel_C4<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); - } - cudaSafeCall(cudaThreadSynchronize()); - - R result[4] = {0, 0, 0, 0}; - cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); - - sum[0] = result[0]; - sum[1] = result[1]; - sum[2] = result[2]; - sum[3] = result[3]; - } - - template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); - - - template - void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) - { - using namespace sum; - typedef typename SumType::R R; - - dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); - - switch (cn) - { - case 1: - sum_kernel, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - break; - case 2: - sum_kernel_C2, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - break; - case 3: - sum_kernel_C3, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - break; - case 4: - sum_kernel_C4, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - break; - } - cudaSafeCall(cudaThreadSynchronize()); - - R result[4] = {0, 0, 0, 0}; - cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); - - sum[0] = result[0]; - sum[1] = result[1]; - sum[2] = result[2]; - sum[3] = result[3]; - } - - template void sum_caller(const DevMem2D, PtrStep, double*, int); - template void sum_caller(const DevMem2D, PtrStep, double*, int); - template void sum_caller(const 
DevMem2D, PtrStep, double*, int); - template void sum_caller(const DevMem2D, PtrStep, double*, int); - template void sum_caller(const DevMem2D, PtrStep, double*, int); - template void sum_caller(const DevMem2D, PtrStep, double*, int); - - - template - void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) - { - using namespace sum; - typedef typename SumType::R R; - - dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); - - switch (cn) - { - case 1: - sum_kernel, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); - break; - case 2: - sum_kernel_C2, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel_C2<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); - break; - case 3: - sum_kernel_C3, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel_C3<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); - break; - case 4: - sum_kernel_C4, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - sum_pass2_kernel_C4<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); - break; - } - cudaSafeCall(cudaThreadSynchronize()); - - R result[4] = {0, 0, 0, 0}; - cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); - - sum[0] = result[0]; - sum[1] = result[1]; - sum[2] = result[2]; - sum[3] = result[3]; - } - - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); - - - template - void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) - { - using namespace sum; - typedef typename SumType::R R; - - dim3 threads, grid; - estimate_thread_cfg(src.cols, src.rows, threads, grid); - set_kernel_consts(src.cols, src.rows, threads, grid); - - switch (cn) - { - case 1: - sum_kernel, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - break; - case 2: - sum_kernel_C2, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - break; - case 3: - sum_kernel_C3, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - break; - case 4: - sum_kernel_C4, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_t*)buf.ptr(0)); - break; - } - cudaSafeCall(cudaThreadSynchronize()); - - R result[4] = {0, 0, 0, 0}; - cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); - - sum[0] = result[0]; - sum[1] = result[1]; - sum[2] = result[2]; - sum[3] = result[3]; - } - - template void sqsum_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_caller(const 
DevMem2D, PtrStep, double*, int); }}} diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu index e69de29..b16b495 100644 --- a/modules/gpu/src/cuda/matrix_reductions.cu +++ b/modules/gpu/src/cuda/matrix_reductions.cu @@ -0,0 +1,1609 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "opencv2/gpu/device/limits_gpu.hpp" +#include "opencv2/gpu/device/saturate_cast.hpp" +#include "opencv2/gpu/device/vecmath.hpp" +#include "transform.hpp" +#include "internal_shared.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace cv { namespace gpu { namespace mathfunc +{ + + // Performs reduction in shared memory + template + __device__ void sum_in_smem(volatile T* data, const uint tid) + { + T sum = data[tid]; + + if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); } + if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); } + if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); } + + if (tid < 32) + { + if (size >= 64) data[tid] = sum = sum + data[tid + 32]; + if (size >= 32) data[tid] = sum = sum + data[tid + 16]; + if (size >= 16) data[tid] = sum = sum + data[tid + 8]; + if (size >= 8) data[tid] = sum = sum + data[tid + 4]; + if (size >= 4) data[tid] = sum = sum + data[tid + 2]; + if (size >= 2) data[tid] = sum = sum + data[tid + 1]; + } + } + + + struct Mask8U + { + explicit Mask8U(PtrStep mask): mask(mask) {} + + __device__ bool operator()(int y, int x) const + { + return mask.ptr(y)[x]; + } + + PtrStep mask; + }; + + + struct MaskTrue + { + __device__ bool operator()(int y, int x) const + { + return true; + } + }; + + ////////////////////////////////////////////////////////////////////////////// + // Min max + + // To avoid shared bank conflicts we convert each value into value of + // appropriate type (32 bits minimum) + template struct MinMaxTypeTraits {}; + template <> struct MinMaxTypeTraits { typedef int best_type; }; + template <> struct MinMaxTypeTraits { typedef int best_type; }; + template <> struct MinMaxTypeTraits { typedef int best_type; }; + template <> struct MinMaxTypeTraits { typedef int best_type; }; + template <> struct MinMaxTypeTraits { typedef int best_type; }; + template <> struct MinMaxTypeTraits { typedef float best_type; }; + template <> struct MinMaxTypeTraits { typedef double best_type; }; + + + namespace minmax + { + + __constant__ int ctwidth; + __constant__ int ctheight; + + // Global counter of blocks finished its work + __device__ uint blocks_finished = 0; + + + // Estimates good thread configuration + // - threads variable satisfies to threads.x * threads.y == 256 + void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid) + { + threads = dim3(32, 8); + grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); + grid.x = min(grid.x, threads.x); + grid.y = min(grid.y, threads.y); + } + + + // Returns required buffer sizes + void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows) + { + dim3 threads, grid; + estimate_thread_cfg(cols, rows, threads, grid); + bufcols = grid.x * grid.y * elem_size; + bufrows = 2; + } + + + // Estimates device constants which are used in the kernels using specified thread configuration + void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) + { + int twidth = divUp(divUp(cols, grid.x), threads.x); + int theight = divUp(divUp(rows, grid.y), threads.y); + cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth))); + cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight))); + } + + + // Does min and max in shared memory + template + __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval) + { + minval[tid] = 
min(minval[tid], minval[tid + offset]); + maxval[tid] = max(maxval[tid], maxval[tid + offset]); + } + + + template + __device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const uint tid) + { + if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); } + if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); } + if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); } + + if (tid < 32) + { + if (size >= 64) merge(tid, 32, minval, maxval); + if (size >= 32) merge(tid, 16, minval, maxval); + if (size >= 16) merge(tid, 8, minval, maxval); + if (size >= 8) merge(tid, 4, minval, maxval); + if (size >= 4) merge(tid, 2, minval, maxval); + if (size >= 2) merge(tid, 1, minval, maxval); + } + } + + + template + __global__ void min_max_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval) + { + typedef typename MinMaxTypeTraits::best_type best_type; + __shared__ best_type sminval[nthreads]; + __shared__ best_type smaxval[nthreads]; + + uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; + uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; + uint tid = threadIdx.y * blockDim.x + threadIdx.x; + + T mymin = numeric_limits_gpu::max(); + T mymax = numeric_limits_gpu::is_signed ? -numeric_limits_gpu::max() : numeric_limits_gpu::min(); + uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows); + uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols); + for (uint y = y0; y < y_end; y += blockDim.y) + { + const T* src_row = (const T*)src.ptr(y); + for (uint x = x0; x < x_end; x += blockDim.x) + { + T val = src_row[x]; + if (mask(y, x)) + { + mymin = min(mymin, val); + mymax = max(mymax, val); + } + } + } + + sminval[tid] = mymin; + smaxval[tid] = mymax; + __syncthreads(); + + find_min_max_in_smem(sminval, smaxval, tid); + + if (tid == 0) + { + minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; + maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; + } + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 + __shared__ bool is_last; + + if (tid == 0) + { + minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; + maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; + __threadfence(); + + uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + is_last = ticket == gridDim.x * gridDim.y - 1; + } + + __syncthreads(); + + if (is_last) + { + uint idx = min(tid, gridDim.x * gridDim.y - 1); + + sminval[tid] = minval[idx]; + smaxval[tid] = maxval[idx]; + __syncthreads(); + + find_min_max_in_smem(sminval, smaxval, tid); + + if (tid == 0) + { + minval[0] = (T)sminval[0]; + maxval[0] = (T)smaxval[0]; + blocks_finished = 0; + } + } +#else + if (tid == 0) + { + minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; + maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; + } +#endif + } + + + template + void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf) + { + dim3 threads, grid; + estimate_thread_cfg(src.cols, src.rows, threads, grid); + set_kernel_consts(src.cols, src.rows, threads, grid); + + T* minval_buf = (T*)buf.ptr(0); + T* maxval_buf = (T*)buf.ptr(1); + + min_max_kernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf); + cudaSafeCall(cudaThreadSynchronize()); + + T minval_, maxval_; + cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), 
cudaMemcpyDeviceToHost)); + *minval = minval_; + *maxval = maxval_; + } + + template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + + + template + void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf) + { + dim3 threads, grid; + estimate_thread_cfg(src.cols, src.rows, threads, grid); + set_kernel_consts(src.cols, src.rows, threads, grid); + + T* minval_buf = (T*)buf.ptr(0); + T* maxval_buf = (T*)buf.ptr(1); + + min_max_kernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf); + cudaSafeCall(cudaThreadSynchronize()); + + T minval_, maxval_; + cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + *minval = minval_; + *maxval = maxval_; + } + + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller(const DevMem2D, double*,double*, PtrStep); + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + + + template + __global__ void min_max_pass2_kernel(T* minval, T* maxval, int size) + { + typedef typename MinMaxTypeTraits::best_type best_type; + __shared__ best_type sminval[nthreads]; + __shared__ best_type smaxval[nthreads]; + + uint tid = threadIdx.y * blockDim.x + threadIdx.x; + uint idx = min(tid, gridDim.x * gridDim.y - 1); + + sminval[tid] = minval[idx]; + smaxval[tid] = maxval[idx]; + __syncthreads(); + + find_min_max_in_smem(sminval, smaxval, tid); + + if (tid == 0) + { + minval[0] = (T)sminval[0]; + maxval[0] = (T)smaxval[0]; + } + } + + + template + void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf) + { + dim3 threads, grid; + estimate_thread_cfg(src.cols, src.rows, threads, grid); + set_kernel_consts(src.cols, src.rows, threads, grid); + + T* minval_buf = (T*)buf.ptr(0); + T* maxval_buf = (T*)buf.ptr(1); + + min_max_kernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf); + min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y); + cudaSafeCall(cudaThreadSynchronize()); + + T minval_, maxval_; + cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + *minval = minval_; + *maxval = maxval_; + } + + template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void 
min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + + + template + void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf) + { + dim3 threads, grid; + estimate_thread_cfg(src.cols, src.rows, threads, grid); + set_kernel_consts(src.cols, src.rows, threads, grid); + + T* minval_buf = (T*)buf.ptr(0); + T* maxval_buf = (T*)buf.ptr(1); + + min_max_kernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf); + min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y); + cudaSafeCall(cudaThreadSynchronize()); + + T minval_, maxval_; + cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + *minval = minval_; + *maxval = maxval_; + } + + template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); + + } // namespace minmax + +/////////////////////////////////////////////////////////////////////////////// +// minMaxLoc + + namespace minmaxloc { + + __constant__ int ctwidth; + __constant__ int ctheight; + + // Global counter of blocks finished its work + __device__ uint blocks_finished = 0; + + + // Estimates good thread configuration + // - threads variable satisfies to threads.x * threads.y == 256 + void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid) + { + threads = dim3(32, 8); + grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); + grid.x = min(grid.x, threads.x); + grid.y = min(grid.y, threads.y); + } + + + // Returns required buffer sizes + void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols, + int& b1rows, int& b2cols, int& b2rows) + { + dim3 threads, grid; + estimate_thread_cfg(cols, rows, threads, grid); + b1cols = grid.x * grid.y * elem_size; // For values + b1rows = 2; + b2cols = grid.x * grid.y * sizeof(int); // For locations + b2rows = 2; + } + + + // Estimates device constants which are used in the kernels using specified thread configuration + void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) + { + int twidth = divUp(divUp(cols, grid.x), threads.x); + int theight = divUp(divUp(rows, grid.y), threads.y); + cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth))); + cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight))); + } + + + template + __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval, + volatile uint* minloc, volatile uint* maxloc) + { + T val = minval[tid + offset]; + if (val < minval[tid]) + { + minval[tid] = val; + minloc[tid] = minloc[tid + offset]; + } + val = maxval[tid + offset]; + if (val > maxval[tid]) + { + maxval[tid] = val; + maxloc[tid] = 
maxloc[tid + offset]; + } + } + + + template + __device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile uint* minloc, + volatile uint* maxloc, const uint tid) + { + if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); } + if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); } + if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); } + + if (tid < 32) + { + if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc); + if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc); + if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc); + if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc); + if (size >= 4) merge(tid, 2, minval, maxval, minloc, maxloc); + if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc); + } + } + + + template + __global__ void min_max_loc_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval, + uint* minloc, uint* maxloc) + { + typedef typename MinMaxTypeTraits::best_type best_type; + __shared__ best_type sminval[nthreads]; + __shared__ best_type smaxval[nthreads]; + __shared__ uint sminloc[nthreads]; + __shared__ uint smaxloc[nthreads]; + + uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; + uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; + uint tid = threadIdx.y * blockDim.x + threadIdx.x; + + T mymin = numeric_limits_gpu::max(); + T mymax = numeric_limits_gpu::is_signed ? -numeric_limits_gpu::max() : numeric_limits_gpu::min(); + uint myminloc = 0; + uint mymaxloc = 0; + uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows); + uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols); + + for (uint y = y0; y < y_end; y += blockDim.y) + { + const T* ptr = (const T*)src.ptr(y); + for (uint x = x0; x < x_end; x += blockDim.x) + { + if (mask(y, x)) + { + T val = ptr[x]; + if (val <= mymin) { mymin = val; myminloc = y * src.cols + x; } + if (val >= mymax) { mymax = val; mymaxloc = y * src.cols + x; } + } + } + } + + sminval[tid] = mymin; + smaxval[tid] = mymax; + sminloc[tid] = myminloc; + smaxloc[tid] = mymaxloc; + __syncthreads(); + + find_min_max_loc_in_smem(sminval, smaxval, sminloc, smaxloc, tid); + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 + __shared__ bool is_last; + + if (tid == 0) + { + minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; + maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; + minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0]; + maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0]; + __threadfence(); + + uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + is_last = ticket == gridDim.x * gridDim.y - 1; + } + + __syncthreads(); + + if (is_last) + { + uint idx = min(tid, gridDim.x * gridDim.y - 1); + + sminval[tid] = minval[idx]; + smaxval[tid] = maxval[idx]; + sminloc[tid] = minloc[idx]; + smaxloc[tid] = maxloc[idx]; + __syncthreads(); + + find_min_max_loc_in_smem(sminval, smaxval, sminloc, smaxloc, tid); + + if (tid == 0) + { + minval[0] = (T)sminval[0]; + maxval[0] = (T)smaxval[0]; + minloc[0] = sminloc[0]; + maxloc[0] = smaxloc[0]; + blocks_finished = 0; + } + } +#else + if (tid == 0) + { + minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; + maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; + minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0]; + maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0]; + } +#endif 
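+            // Epilogue summary: on compute capability >= 1.1 the branch above finishes the
+            // cross-block reduction within this launch (the last block to take a
+            // blocks_finished ticket folds every block's min/max and location into element 0);
+            // on 1.0 devices each block only stores its own result and
+            // min_max_loc_pass2_kernel below completes the reduction in a second launch.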
+ } + + + template + void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, + int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) + { + dim3 threads, grid; + estimate_thread_cfg(src.cols, src.rows, threads, grid); + set_kernel_consts(src.cols, src.rows, threads, grid); + + T* minval_buf = (T*)valbuf.ptr(0); + T* maxval_buf = (T*)valbuf.ptr(1); + uint* minloc_buf = (uint*)locbuf.ptr(0); + uint* maxloc_buf = (uint*)locbuf.ptr(1); + + min_max_loc_kernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf); + cudaSafeCall(cudaThreadSynchronize()); + + T minval_, maxval_; + cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + *minval = minval_; + *maxval = maxval_; + + uint minloc_, maxloc_; + cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); + minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; + maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; + } + + template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + + + template + void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval, + int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) + { + dim3 threads, grid; + estimate_thread_cfg(src.cols, src.rows, threads, grid); + set_kernel_consts(src.cols, src.rows, threads, grid); + + T* minval_buf = (T*)valbuf.ptr(0); + T* maxval_buf = (T*)valbuf.ptr(1); + uint* minloc_buf = (uint*)locbuf.ptr(0); + uint* maxloc_buf = (uint*)locbuf.ptr(1); + + min_max_loc_kernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf); + cudaSafeCall(cudaThreadSynchronize()); + + T minval_, maxval_; + cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + *minval = minval_; + *maxval = maxval_; + + uint minloc_, maxloc_; + cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); + minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; + maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; + } + + template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void 
min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + + + // This kernel will be used only when compute capability is 1.0 + template + __global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size) + { + typedef typename MinMaxTypeTraits::best_type best_type; + __shared__ best_type sminval[nthreads]; + __shared__ best_type smaxval[nthreads]; + __shared__ uint sminloc[nthreads]; + __shared__ uint smaxloc[nthreads]; + + uint tid = threadIdx.y * blockDim.x + threadIdx.x; + uint idx = min(tid, gridDim.x * gridDim.y - 1); + + sminval[tid] = minval[idx]; + smaxval[tid] = maxval[idx]; + sminloc[tid] = minloc[idx]; + smaxloc[tid] = maxloc[idx]; + __syncthreads(); + + find_min_max_loc_in_smem(sminval, smaxval, sminloc, smaxloc, tid); + + if (tid == 0) + { + minval[0] = (T)sminval[0]; + maxval[0] = (T)smaxval[0]; + minloc[0] = sminloc[0]; + maxloc[0] = smaxloc[0]; + } + } + + + template + void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, + int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) + { + dim3 threads, grid; + estimate_thread_cfg(src.cols, src.rows, threads, grid); + set_kernel_consts(src.cols, src.rows, threads, grid); + + T* minval_buf = (T*)valbuf.ptr(0); + T* maxval_buf = (T*)valbuf.ptr(1); + uint* minloc_buf = (uint*)locbuf.ptr(0); + uint* maxloc_buf = (uint*)locbuf.ptr(1); + + min_max_loc_kernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf); + min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); + cudaSafeCall(cudaThreadSynchronize()); + + T minval_, maxval_; + cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + *minval = minval_; + *maxval = maxval_; + + uint minloc_, maxloc_; + cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); + minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; + maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; + } + + template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + + + template + void 
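// On devices without global atomics (compute capability 1.0) the *_multipass_caller
// variants cannot use the ticket scheme, so a second, single-block kernel folds the
// per-block partials instead. One block of nthreads threads is always enough because
// the first-pass grid is capped at threads.x * threads.y blocks by the
// estimate_thread_cfg helpers. A compact sketch of such a second pass for a plain
// minimum; fold_partials_min and partial are illustrative names.
#include <cfloat>
#include <cuda_runtime.h>

template <int NTHREADS>
__global__ void fold_partials_min(float* partial, int count)   // count <= NTHREADS
{
    __shared__ float smin[NTHREADS];

    smin[threadIdx.x] = threadIdx.x < count ? partial[threadIdx.x] : FLT_MAX;
    __syncthreads();

    for (int s = NTHREADS / 2; s > 0; s >>= 1)
    {
        if (threadIdx.x < s)
            smin[threadIdx.x] = fminf(smin[threadIdx.x], smin[threadIdx.x + s]);
        __syncthreads();
    }
    if (threadIdx.x == 0)
        partial[0] = smin[0];
}
// host side, mirroring the multipass callers here:
//   fold_partials_min<256><<<1, 256>>>(partial_buf, grid.x * grid.y);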
min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval, + int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) + { + dim3 threads, grid; + estimate_thread_cfg(src.cols, src.rows, threads, grid); + set_kernel_consts(src.cols, src.rows, threads, grid); + + T* minval_buf = (T*)valbuf.ptr(0); + T* maxval_buf = (T*)valbuf.ptr(1); + uint* minloc_buf = (uint*)locbuf.ptr(0); + uint* maxloc_buf = (uint*)locbuf.ptr(1); + + min_max_loc_kernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf); + min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); + cudaSafeCall(cudaThreadSynchronize()); + + T minval_, maxval_; + cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); + *minval = minval_; + *maxval = maxval_; + + uint minloc_, maxloc_; + cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); + minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; + maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; + } + + template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + + } // namespace minmaxloc + +////////////////////////////////////////////////////////////////////////////////////////////////////////// +// countNonZero + + namespace countnonzero + { + + __constant__ int ctwidth; + __constant__ int ctheight; + + __device__ uint blocks_finished = 0; + + void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid) + { + threads = dim3(32, 8); + grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); + grid.x = min(grid.x, threads.x); + grid.y = min(grid.y, threads.y); + } + + + void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows) + { + dim3 threads, grid; + estimate_thread_cfg(cols, rows, threads, grid); + bufcols = grid.x * grid.y * sizeof(int); + bufrows = 1; + } + + + void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) + { + int twidth = divUp(divUp(cols, grid.x), threads.x); + int theight = divUp(divUp(rows, grid.y), threads.y); + cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); + cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); + } + + + template + __global__ void count_non_zero_kernel(const DevMem2D src, volatile uint* count) + { + __shared__ uint scount[nthreads]; + + uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; + uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; + uint tid = threadIdx.y * blockDim.x + threadIdx.x; + + uint cnt = 0; + for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) + { + const T* ptr = (const T*)src.ptr(y0 + y * 
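// The launch-configuration helpers above fix the block at 32x8 threads, cap the grid
// at 32x8 = 256 blocks, and store the resulting per-thread tile size in constant
// memory so the kernels know how many columns (ctwidth) and rows (ctheight) each
// thread must cover. A consolidated sketch of that scheme with illustrative names
// (c_twidth, c_theight, make_launch_cfg, upload_tile_size):
#include <algorithm>
#include <cuda_runtime.h>

__constant__ int c_twidth;    // columns processed per thread
__constant__ int c_theight;   // rows processed per thread

static int divUp(int a, int b) { return (a + b - 1) / b; }

static void make_launch_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
    threads = dim3(32, 8);
    grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
    grid.x = std::min(grid.x, threads.x);    // never more than 32 x 8 = 256 blocks,
    grid.y = std::min(grid.y, threads.y);    // so one 256-thread block can finish the job
}

static void upload_tile_size(int cols, int rows, const dim3& threads, const dim3& grid)
{
    int twidth  = divUp(divUp(cols, grid.x), threads.x);
    int theight = divUp(divUp(rows, grid.y), threads.y);
    cudaMemcpyToSymbol(c_twidth,  &twidth,  sizeof(twidth));   // same symbol-upload pattern
    cudaMemcpyToSymbol(c_theight, &theight, sizeof(theight));  // as set_kernel_consts above
}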
blockDim.y); + for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) + cnt += ptr[x0 + x * blockDim.x] != 0; + } + + scount[tid] = cnt; + __syncthreads(); + + sum_in_smem(scount, tid); + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 + __shared__ bool is_last; + + if (tid == 0) + { + count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0]; + __threadfence(); + + uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + is_last = ticket == gridDim.x * gridDim.y - 1; + } + + __syncthreads(); + + if (is_last) + { + scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0; + __syncthreads(); + + sum_in_smem(scount, tid); + + if (tid == 0) + { + count[0] = scount[0]; + blocks_finished = 0; + } + } +#else + if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0]; +#endif + } + + + template + int count_non_zero_caller(const DevMem2D src, PtrStep buf) + { + dim3 threads, grid; + estimate_thread_cfg(src.cols, src.rows, threads, grid); + set_kernel_consts(src.cols, src.rows, threads, grid); + + uint* count_buf = (uint*)buf.ptr(0); + + count_non_zero_kernel<256, T><<>>(src, count_buf); + cudaSafeCall(cudaThreadSynchronize()); + + uint count; + cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost)); + + return count; + } + + template int count_non_zero_caller(const DevMem2D, PtrStep); + template int count_non_zero_caller(const DevMem2D, PtrStep); + template int count_non_zero_caller(const DevMem2D, PtrStep); + template int count_non_zero_caller(const DevMem2D, PtrStep); + template int count_non_zero_caller(const DevMem2D, PtrStep); + template int count_non_zero_caller(const DevMem2D, PtrStep); + template int count_non_zero_caller(const DevMem2D, PtrStep); + + + template + __global__ void count_non_zero_pass2_kernel(uint* count, int size) + { + __shared__ uint scount[nthreads]; + uint tid = threadIdx.y * blockDim.x + threadIdx.x; + + scount[tid] = tid < size ? 
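// count_non_zero_kernel and the sum kernels below fold their per-thread partials with
// a sum_in_smem helper defined earlier in this file (not visible in this hunk). A
// conventional implementation of that pattern, mirroring the structure of
// find_min_max_loc_in_smem above (unrolled tree over a power-of-two shared array,
// warp-synchronous for the last 32 lanes); block_sum_in_smem is an illustrative
// stand-in name:
template <int size, typename T>
__device__ void block_sum_in_smem(volatile T* data, const unsigned int tid)
{
    if (size >= 512) { if (tid < 256) { data[tid] += data[tid + 256]; } __syncthreads(); }
    if (size >= 256) { if (tid < 128) { data[tid] += data[tid + 128]; } __syncthreads(); }
    if (size >= 128) { if (tid <  64) { data[tid] += data[tid +  64]; } __syncthreads(); }

    if (tid < 32)   // a single warp: no __syncthreads() needed on these devices
    {
        if (size >= 64) data[tid] += data[tid + 32];
        if (size >= 32) data[tid] += data[tid + 16];
        if (size >= 16) data[tid] += data[tid +  8];
        if (size >=  8) data[tid] += data[tid +  4];
        if (size >=  4) data[tid] += data[tid +  2];
        if (size >=  2) data[tid] += data[tid +  1];
    }
}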
count[tid] : 0; + __syncthreads(); + + sum_in_smem(scount, tid); + + if (tid == 0) + count[0] = scount[0]; + } + + + template + int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf) + { + dim3 threads, grid; + estimate_thread_cfg(src.cols, src.rows, threads, grid); + set_kernel_consts(src.cols, src.rows, threads, grid); + + uint* count_buf = (uint*)buf.ptr(0); + + count_non_zero_kernel<256, T><<>>(src, count_buf); + count_non_zero_pass2_kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y); + cudaSafeCall(cudaThreadSynchronize()); + + uint count; + cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost)); + + return count; + } + + template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); + template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); + template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); + template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); + template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); + template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); + + } // namespace countnonzero + + + ////////////////////////////////////////////////////////////////////////// + // Sum + + namespace sum + { + + template struct SumType {}; + template <> struct SumType { typedef uint R; }; + template <> struct SumType { typedef int R; }; + template <> struct SumType { typedef uint R; }; + template <> struct SumType { typedef int R; }; + template <> struct SumType { typedef int R; }; + template <> struct SumType { typedef float R; }; + template <> struct SumType { typedef double R; }; + + template + struct IdentityOp { static __device__ R call(R x) { return x; } }; + + template + struct SqrOp { static __device__ R call(R x) { return x * x; } }; + + __constant__ int ctwidth; + __constant__ int ctheight; + __device__ uint blocks_finished = 0; + + const int threads_x = 32; + const int threads_y = 8; + + void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid) + { + threads = dim3(threads_x, threads_y); + grid = dim3(divUp(cols, threads.x * threads.y), + divUp(rows, threads.y * threads.x)); + grid.x = min(grid.x, threads.x); + grid.y = min(grid.y, threads.y); + } + + + void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows) + { + dim3 threads, grid; + estimate_thread_cfg(cols, rows, threads, grid); + bufcols = grid.x * grid.y * sizeof(double) * cn; + bufrows = 1; + } + + + void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) + { + int twidth = divUp(divUp(cols, grid.x), threads.x); + int theight = divUp(divUp(rows, grid.y), threads.y); + cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); + cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); + } + + template + __global__ void sum_kernel(const DevMem2D src, R* result) + { + __shared__ R smem[nthreads]; + + const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; + const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + const int bid = blockIdx.y * gridDim.x + blockIdx.x; + + R sum = 0; + for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) + { + const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y); + for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) + sum += Op::call(ptr[x0 + x * blockDim.x]); + } + + smem[tid] = sum; + __syncthreads(); + + sum_in_smem(smem, tid); + +#if defined(__CUDA_ARCH__) && 
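// The sum kernels serve both sum() and sqrSum() through the two template knobs set up
// above: SumType picks a wider accumulator for narrow element types (uchar and ushort
// accumulate in uint, char and short in int, float in float, double in double), and
// the per-element functor is either the identity or the square. A condensed sketch of
// the idea with illustrative names (AccType, PassThrough, Square):
template <typename T> struct AccType;
template <> struct AccType<unsigned char>  { typedef unsigned int R; };
template <> struct AccType<short>          { typedef int          R; };
template <> struct AccType<float>          { typedef float        R; };
template <> struct AccType<double>         { typedef double       R; };

template <typename R> struct PassThrough { static __device__ R call(R x) { return x;     } };
template <typename R> struct Square      { static __device__ R call(R x) { return x * x; } };

// Instantiating the same kernel with IdentityOp gives the plain sum, with SqrOp the
// sum of squares; nothing else in the kernel body changes.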
__CUDA_ARCH__ >= 110 + __shared__ bool is_last; + + if (tid == 0) + { + result[bid] = smem[0]; + __threadfence(); + + uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + is_last = (ticket == gridDim.x * gridDim.y - 1); + } + + __syncthreads(); + + if (is_last) + { + smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0; + __syncthreads(); + + sum_in_smem(smem, tid); + + if (tid == 0) + { + result[0] = smem[0]; + blocks_finished = 0; + } + } +#else + if (tid == 0) result[bid] = smem[0]; +#endif + } + + + template + __global__ void sum_pass2_kernel(R* result, int size) + { + __shared__ R smem[nthreads]; + int tid = threadIdx.y * blockDim.x + threadIdx.x; + + smem[tid] = tid < size ? result[tid] : 0; + __syncthreads(); + + sum_in_smem(smem, tid); + + if (tid == 0) + result[0] = smem[0]; + } + + + template + __global__ void sum_kernel_C2(const DevMem2D src, typename TypeVec::vec_t* result) + { + typedef typename TypeVec::vec_t SrcType; + typedef typename TypeVec::vec_t DstType; + + __shared__ R smem[nthreads * 2]; + + const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; + const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + const int bid = blockIdx.y * gridDim.x + blockIdx.x; + + SrcType val; + DstType sum = VecTraits::all(0); + for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) + { + const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y); + for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) + { + val = ptr[x0 + x * blockDim.x]; + sum = sum + VecTraits::make(Op::call(val.x), Op::call(val.y)); + } + } + + smem[tid] = sum.x; + smem[tid + nthreads] = sum.y; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 + __shared__ bool is_last; + + if (tid == 0) + { + DstType res; + res.x = smem[0]; + res.y = smem[nthreads]; + result[bid] = res; + __threadfence(); + + uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + is_last = (ticket == gridDim.x * gridDim.y - 1); + } + + __syncthreads(); + + if (is_last) + { + DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); + smem[tid] = res.x; + smem[tid + nthreads] = res.y; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + + if (tid == 0) + { + res.x = smem[0]; + res.y = smem[nthreads]; + result[0] = res; + blocks_finished = 0; + } + } +#else + if (tid == 0) + { + DstType res; + res.x = smem[0]; + res.y = smem[nthreads]; + result[bid] = res; + } +#endif + } + + + template + __global__ void sum_pass2_kernel_C2(typename TypeVec::vec_t* result, int size) + { + typedef typename TypeVec::vec_t DstType; + + __shared__ R smem[nthreads * 2]; + + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + + DstType res = tid < gridDim.x * gridDim.y ? 
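// Layout used by the multi-channel sum kernels: one bank of nthreads shared-memory
// elements per channel, each bank reduced independently, so channel c of thread tid
// lives at smem[tid + c * nthreads]. For the 2-channel case above with 256 threads,
// schematically:
//
//   __shared__ R smem[256 * 2];
//   smem[tid]       = sum.x;              // channel 0 partials
//   smem[tid + 256] = sum.y;              // channel 1 partials
//   sum_in_smem<256, R>(smem,       tid);
//   sum_in_smem<256, R>(smem + 256, tid);
//   // block result: (smem[0], smem[256])
//
// The C3 and C4 kernels below extend the same layout to three and four banks.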
result[tid] : VecTraits::all(0); + smem[tid] = res.x; + smem[tid + nthreads] = res.y; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + + if (tid == 0) + { + res.x = smem[0]; + res.y = smem[nthreads]; + result[0] = res; + } + } + + + template + __global__ void sum_kernel_C3(const DevMem2D src, typename TypeVec::vec_t* result) + { + typedef typename TypeVec::vec_t SrcType; + typedef typename TypeVec::vec_t DstType; + + __shared__ R smem[nthreads * 3]; + + const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; + const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + const int bid = blockIdx.y * gridDim.x + blockIdx.x; + + SrcType val; + DstType sum = VecTraits::all(0); + for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) + { + const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y); + for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) + { + val = ptr[x0 + x * blockDim.x]; + sum = sum + VecTraits::make(Op::call(val.x), Op::call(val.y), Op::call(val.z)); + } + } + + smem[tid] = sum.x; + smem[tid + nthreads] = sum.y; + smem[tid + 2 * nthreads] = sum.z; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + sum_in_smem(smem + 2 * nthreads, tid); + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 + __shared__ bool is_last; + + if (tid == 0) + { + DstType res; + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + result[bid] = res; + __threadfence(); + + uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + is_last = (ticket == gridDim.x * gridDim.y - 1); + } + + __syncthreads(); + + if (is_last) + { + DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); + smem[tid] = res.x; + smem[tid + nthreads] = res.y; + smem[tid + 2 * nthreads] = res.z; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + sum_in_smem(smem + 2 * nthreads, tid); + + if (tid == 0) + { + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + result[0] = res; + blocks_finished = 0; + } + } +#else + if (tid == 0) + { + DstType res; + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + result[bid] = res; + } +#endif + } + + + template + __global__ void sum_pass2_kernel_C3(typename TypeVec::vec_t* result, int size) + { + typedef typename TypeVec::vec_t DstType; + + __shared__ R smem[nthreads * 3]; + + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + + DstType res = tid < gridDim.x * gridDim.y ? 
result[tid] : VecTraits::all(0); + smem[tid] = res.x; + smem[tid + nthreads] = res.y; + smem[tid + 2 * nthreads] = res.z; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + sum_in_smem(smem + 2 * nthreads, tid); + + if (tid == 0) + { + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + result[0] = res; + } + } + + template + __global__ void sum_kernel_C4(const DevMem2D src, typename TypeVec::vec_t* result) + { + typedef typename TypeVec::vec_t SrcType; + typedef typename TypeVec::vec_t DstType; + + __shared__ R smem[nthreads * 4]; + + const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; + const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + const int bid = blockIdx.y * gridDim.x + blockIdx.x; + + SrcType val; + DstType sum = VecTraits::all(0); + for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) + { + const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y); + for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) + { + val = ptr[x0 + x * blockDim.x]; + sum = sum + VecTraits::make(Op::call(val.x), Op::call(val.y), + Op::call(val.z), Op::call(val.w)); + } + } + + smem[tid] = sum.x; + smem[tid + nthreads] = sum.y; + smem[tid + 2 * nthreads] = sum.z; + smem[tid + 3 * nthreads] = sum.w; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + sum_in_smem(smem + 2 * nthreads, tid); + sum_in_smem(smem + 3 * nthreads, tid); + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 + __shared__ bool is_last; + + if (tid == 0) + { + DstType res; + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + res.w = smem[3 * nthreads]; + result[bid] = res; + __threadfence(); + + uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + is_last = (ticket == gridDim.x * gridDim.y - 1); + } + + __syncthreads(); + + if (is_last) + { + DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); + smem[tid] = res.x; + smem[tid + nthreads] = res.y; + smem[tid + 2 * nthreads] = res.z; + smem[tid + 3 * nthreads] = res.w; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + sum_in_smem(smem + 2 * nthreads, tid); + sum_in_smem(smem + 3 * nthreads, tid); + + if (tid == 0) + { + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + res.w = smem[3 * nthreads]; + result[0] = res; + blocks_finished = 0; + } + } +#else + if (tid == 0) + { + DstType res; + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + res.w = smem[3 * nthreads]; + result[bid] = res; + } +#endif + } + + + template + __global__ void sum_pass2_kernel_C4(typename TypeVec::vec_t* result, int size) + { + typedef typename TypeVec::vec_t DstType; + + __shared__ R smem[nthreads * 4]; + + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + + DstType res = tid < gridDim.x * gridDim.y ? 
result[tid] : VecTraits::all(0); + smem[tid] = res.x; + smem[tid + nthreads] = res.y; + smem[tid + 2 * nthreads] = res.z; + smem[tid + 3 * nthreads] = res.z; + __syncthreads(); + + sum_in_smem(smem, tid); + sum_in_smem(smem + nthreads, tid); + sum_in_smem(smem + 2 * nthreads, tid); + sum_in_smem(smem + 3 * nthreads, tid); + + if (tid == 0) + { + res.x = smem[0]; + res.y = smem[nthreads]; + res.z = smem[2 * nthreads]; + res.w = smem[3 * nthreads]; + result[0] = res; + } + } + + } // namespace sum + + + template + void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) + { + using namespace sum; + typedef typename SumType::R R; + + dim3 threads, grid; + estimate_thread_cfg(src.cols, src.rows, threads, grid); + set_kernel_consts(src.cols, src.rows, threads, grid); + + switch (cn) + { + case 1: + sum_kernel, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + sum_pass2_kernel<<<1, threads_x * threads_y>>>( + (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + case 2: + sum_kernel_C2, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + sum_pass2_kernel_C2<<<1, threads_x * threads_y>>>( + (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + case 3: + sum_kernel_C3, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + sum_pass2_kernel_C3<<<1, threads_x * threads_y>>>( + (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + case 4: + sum_kernel_C4, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + sum_pass2_kernel_C4<<<1, threads_x * threads_y>>>( + (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + } + cudaSafeCall(cudaThreadSynchronize()); + + R result[4] = {0, 0, 0, 0}; + cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); + + sum[0] = result[0]; + sum[1] = result[1]; + sum[2] = result[2]; + sum[3] = result[3]; + } + + template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); + + + template + void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) + { + using namespace sum; + typedef typename SumType::R R; + + dim3 threads, grid; + estimate_thread_cfg(src.cols, src.rows, threads, grid); + set_kernel_consts(src.cols, src.rows, threads, grid); + + switch (cn) + { + case 1: + sum_kernel, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + break; + case 2: + sum_kernel_C2, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + break; + case 3: + sum_kernel_C3, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + break; + case 4: + sum_kernel_C4, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + break; + } + cudaSafeCall(cudaThreadSynchronize()); + + R result[4] = {0, 0, 0, 0}; + cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); + + sum[0] = result[0]; + sum[1] = result[1]; + sum[2] = result[2]; + sum[3] = result[3]; + } + + template void sum_caller(const DevMem2D, PtrStep, double*, int); + template void sum_caller(const DevMem2D, PtrStep, double*, int); + template void sum_caller(const 
DevMem2D, PtrStep, double*, int); + template void sum_caller(const DevMem2D, PtrStep, double*, int); + template void sum_caller(const DevMem2D, PtrStep, double*, int); + template void sum_caller(const DevMem2D, PtrStep, double*, int); + + + template + void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) + { + using namespace sum; + typedef typename SumType::R R; + + dim3 threads, grid; + estimate_thread_cfg(src.cols, src.rows, threads, grid); + set_kernel_consts(src.cols, src.rows, threads, grid); + + switch (cn) + { + case 1: + sum_kernel, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + sum_pass2_kernel<<<1, threads_x * threads_y>>>( + (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + break; + case 2: + sum_kernel_C2, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + sum_pass2_kernel_C2<<<1, threads_x * threads_y>>>( + (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + break; + case 3: + sum_kernel_C3, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + sum_pass2_kernel_C3<<<1, threads_x * threads_y>>>( + (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + break; + case 4: + sum_kernel_C4, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + sum_pass2_kernel_C4<<<1, threads_x * threads_y>>>( + (typename TypeVec::vec_t*)buf.ptr(0), grid.x * grid.y); + break; + } + cudaSafeCall(cudaThreadSynchronize()); + + R result[4] = {0, 0, 0, 0}; + cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); + + sum[0] = result[0]; + sum[1] = result[1]; + sum[2] = result[2]; + sum[3] = result[3]; + } + + template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); + + + template + void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) + { + using namespace sum; + typedef typename SumType::R R; + + dim3 threads, grid; + estimate_thread_cfg(src.cols, src.rows, threads, grid); + set_kernel_consts(src.cols, src.rows, threads, grid); + + switch (cn) + { + case 1: + sum_kernel, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + break; + case 2: + sum_kernel_C2, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + break; + case 3: + sum_kernel_C3, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + break; + case 4: + sum_kernel_C4, threads_x * threads_y><<>>( + src, (typename TypeVec::vec_t*)buf.ptr(0)); + break; + } + cudaSafeCall(cudaThreadSynchronize()); + + R result[4] = {0, 0, 0, 0}; + cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); + + sum[0] = result[0]; + sum[1] = result[1]; + sum[2] = result[2]; + sum[3] = result[3]; + } + + template void sqsum_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_caller(const 
DevMem2D, PtrStep, double*, int); + }}} \ No newline at end of file diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp index 6ad18a1..78c9b6d 100644 --- a/modules/gpu/src/element_operations.cpp +++ b/modules/gpu/src/element_operations.cpp @@ -66,10 +66,14 @@ void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); } void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); } -cv::gpu::GpuMat cv::gpu::operator ~ (const GpuMat&) { throw_nogpu(); return GpuMat(); } -cv::gpu::GpuMat cv::gpu::operator | (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); } -cv::gpu::GpuMat cv::gpu::operator & (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); } -cv::gpu::GpuMat cv::gpu::operator ^ (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); } +void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } +void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); } +void cv::gpu::min(const GpuMat&, double, GpuMat&) { throw_nogpu(); } +void cv::gpu::min(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); } +void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } +void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); } +void cv::gpu::max(const GpuMat&, double, GpuMat&) { throw_nogpu(); } +void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); } #else @@ -574,4 +578,144 @@ void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c ::bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream)); } + +////////////////////////////////////////////////////////////////////////////// +// Minimum and maximum operations + +namespace cv { namespace gpu { namespace mathfunc +{ + template + void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + + template + void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + + template + void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); + + template + void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); +}}} + +namespace +{ + template + void min_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream) + { + CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); + dst.create(src1.size(), src1.type()); + mathfunc::min_gpu(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream); + } + + template + void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream) + { + dst.create(src1.size(), src1.type()); + mathfunc::min_gpu(src1.reshape(1), src2, dst.reshape(1), stream); + } + + template + void max_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream) + { + CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); + dst.create(src1.size(), src1.type()); + mathfunc::max_gpu(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream); + } + + template + void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream) + { + dst.create(src1.size(), src1.type()); + 
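// The min/max callers above reshape every operand with GpuMat::reshape(1): an
// n-channel matrix is reinterpreted as a single-channel one with n times as many
// columns (same device data, no copy), so a single scalar kernel per depth handles
// every channel count. Illustration of what reshape(1) yields (sizes are an example):
//   cv::gpu::GpuMat a(480, 640, CV_8UC3);   // 480 x 640, 3 channels
//   cv::gpu::GpuMat b = a.reshape(1);       // 480 x 1920, CV_8UC1, same memory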
mathfunc::max_gpu(src1.reshape(1), src2, dst.reshape(1), stream); + } +} + +void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) +{ + typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[] = + { + min_caller, min_caller, min_caller, min_caller, min_caller, + min_caller, min_caller + }; + funcs[src1.depth()](src1, src2, dst, 0); +} + +void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[] = + { + min_caller, min_caller, min_caller, min_caller, min_caller, + min_caller, min_caller + }; + funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst) +{ + typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[] = + { + min_caller, min_caller, min_caller, min_caller, min_caller, + min_caller, min_caller + }; + funcs[src1.depth()](src1, src2, dst, 0); +} + +void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[] = + { + min_caller, min_caller, min_caller, min_caller, min_caller, + min_caller, min_caller + }; + funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) +{ + typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[] = + { + max_caller, max_caller, max_caller, max_caller, max_caller, + max_caller, max_caller + }; + funcs[src1.depth()](src1, src2, dst, 0); +} + +void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[] = + { + max_caller, max_caller, max_caller, max_caller, max_caller, + max_caller, max_caller + }; + funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst) +{ + typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[] = + { + max_caller, max_caller, max_caller, max_caller, max_caller, + max_caller, max_caller + }; + funcs[src1.depth()](src1, src2, dst, 0); +} + +void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[] = + { + max_caller, max_caller, max_caller, max_caller, max_caller, + max_caller, max_caller + }; + funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream)); +} + #endif \ No newline at end of file diff --git a/modules/gpu/src/matrix_reductions.cpp b/modules/gpu/src/matrix_reductions.cpp index e69de29..4fff221 100644 --- a/modules/gpu/src/matrix_reductions.cpp +++ b/modules/gpu/src/matrix_reductions.cpp @@ -0,0 +1,423 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
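// Dispatch pattern used by cv::gpu::min/max above: OpenCV depth codes are small
// consecutive integers (CV_8U = 0, CV_8S = 1, CV_16U = 2, CV_16S = 3, CV_32S = 4,
// CV_32F = 5, CV_64F = 6), so a table of seven function pointers indexed by
// src1.depth() selects the correctly typed instantiation without a switch.
// Typical call sites, assuming a and b are GpuMats of identical size and type:
//   cv::gpu::GpuMat a, b, c;
//   cv::gpu::min(a, b, c);        // c(i) = min(a(i), b(i)), synchronous
//   cv::gpu::Stream s;
//   cv::gpu::max(a, b, c, s);     // asynchronous variant enqueued on stream s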
+// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other GpuMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or bpied warranties, including, but not limited to, the bpied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" + +using namespace cv; +using namespace cv::gpu; + +#if !defined (HAVE_CUDA) + +void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); } +double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; } +double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; } +Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); } +Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); } +Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); } +Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); } +void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); } +void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); } +void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); } +void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); } +int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; } +int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; } + +#else + + +//////////////////////////////////////////////////////////////////////// +// meanStdDev + +void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev) +{ + CV_Assert(src.type() == CV_8UC1); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr(), src.step, sz, mean.val, stddev.val) ); +} + + +//////////////////////////////////////////////////////////////////////// +// norm + +double cv::gpu::norm(const GpuMat& src1, int normType) +{ + return norm(src1, GpuMat(src1.size(), src1.type(), Scalar::all(0.0)), normType); +} + +double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType) +{ + CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type()); + + CV_Assert(src1.type() == CV_8UC1); + CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2); + + typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2, + NppiSize oSizeROI, Npp64f* pRetVal); + + static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R}; + + NppiSize sz; + sz.width = src1.cols; + sz.height = src1.rows; + + int funcIdx = normType >> 1; + double retVal; + + nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr(), src1.step, + src2.ptr(), src2.step, + sz, &retVal) ); + + return retVal; +} + +//////////////////////////////////////////////////////////////////////// +// Sum + +namespace cv { namespace gpu { namespace mathfunc +{ + template + void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); + + template + void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); + + template + void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); + + template + void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); + + namespace sum + { + void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows); + } +}}} + + +Scalar cv::gpu::sum(const GpuMat& src) +{ + GpuMat buf; + return sum(src, buf); +} + + +Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf) +{ + using namespace mathfunc; + + typedef void (*Caller)(const DevMem2D, PtrStep, double*, int); + static const Caller 
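// The table index normType >> 1 used above works because OpenCV's norm flags are
// NORM_INF = 1, NORM_L1 = 2 and NORM_L2 = 4, which map to 0, 1, 2 -- the
// {Inf, L1, L2} order of npp_norm_diff_func. Typical use of these wrappers,
// assuming CV_8UC1 inputs d_a and d_b of the same size:
//   double linf = cv::gpu::norm(d_a, cv::NORM_INF);       // norm of d_a itself
//   double dl2  = cv::gpu::norm(d_a, d_b, cv::NORM_L2);   // norm of the difference
//   cv::Scalar mean, stddev;
//   cv::gpu::meanStdDev(d_a, mean, stddev);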
callers[2][7] = + { { sum_multipass_caller, sum_multipass_caller, + sum_multipass_caller, sum_multipass_caller, + sum_multipass_caller, sum_multipass_caller, 0 }, + { sum_caller, sum_caller, + sum_caller, sum_caller, + sum_caller, sum_caller, 0 } }; + + Size bufSize; + sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); + buf.create(bufSize, CV_8U); + + Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()]; + if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type"); + + double result[4]; + caller(src, buf, result, src.channels()); + return Scalar(result[0], result[1], result[2], result[3]); +} + + +Scalar cv::gpu::sqrSum(const GpuMat& src) +{ + GpuMat buf; + return sqrSum(src, buf); +} + + +Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) +{ + using namespace mathfunc; + + typedef void (*Caller)(const DevMem2D, PtrStep, double*, int); + static const Caller callers[2][7] = + { { sqsum_multipass_caller, sqsum_multipass_caller, + sqsum_multipass_caller, sqsum_multipass_caller, + sqsum_multipass_caller, sqsum_multipass_caller, 0 }, + { sqsum_caller, sqsum_caller, + sqsum_caller, sqsum_caller, + sqsum_caller, sqsum_caller, 0 } }; + + Size bufSize; + sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); + buf.create(bufSize, CV_8U); + + Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()]; + if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type"); + + double result[4]; + caller(src, buf, result, src.channels()); + return Scalar(result[0], result[1], result[2], result[3]); +} + +//////////////////////////////////////////////////////////////////////// +// Find min or max + +namespace cv { namespace gpu { namespace mathfunc { namespace minmax { + + void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows); + + template + void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf); + + template + void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf); + + template + void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf); + + template + void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf); + +}}}} + + +void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask) +{ + GpuMat buf; + minMax(src, minVal, maxVal, mask, buf); +} + + +void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf) +{ + using namespace mathfunc::minmax; + + typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep); + typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep); + + static const Caller callers[2][7] = + { { min_max_multipass_caller, min_max_multipass_caller, + min_max_multipass_caller, min_max_multipass_caller, + min_max_multipass_caller, min_max_multipass_caller, 0 }, + { min_max_caller, min_max_caller, + min_max_caller, min_max_caller, + min_max_caller, min_max_caller, min_max_caller } }; + + static const MaskedCaller masked_callers[2][7] = + { { min_max_mask_multipass_caller, min_max_mask_multipass_caller, + min_max_mask_multipass_caller, min_max_mask_multipass_caller, + min_max_mask_multipass_caller, min_max_mask_multipass_caller, 0 }, + { min_max_mask_caller, min_max_mask_caller, + min_max_mask_caller, min_max_mask_caller, + min_max_mask_caller, 
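// The two-argument overloads of sum()/sqrSum() exist so the intermediate reduction
// buffer can be reused: get_buf_size_required() sizes it and GpuMat::create() is a
// no-op when the size already matches. Typical use in a processing loop, assuming
// frame is a CV_8UC1 GpuMat refreshed every iteration:
//   cv::gpu::GpuMat frame, redBuf;
//   for (;;)
//   {
//       // ... upload the next frame ...
//       cv::Scalar s  = cv::gpu::sum(frame, redBuf);      // redBuf allocated on first use,
//       cv::Scalar s2 = cv::gpu::sqrSum(frame, redBuf);   // then reused by later calls
//   }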
min_max_mask_caller, + min_max_mask_caller } }; + + + CV_Assert(src.channels() == 1); + CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); + CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice())); + + double minVal_; if (!minVal) minVal = &minVal_; + double maxVal_; if (!maxVal) maxVal = &maxVal_; + + Size bufSize; + get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height); + buf.create(bufSize, CV_8U); + + if (mask.empty()) + { + Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; + if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type"); + caller(src, minVal, maxVal, buf); + } + else + { + MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()]; + if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type"); + caller(src, mask, minVal, maxVal, buf); + } +} + + +//////////////////////////////////////////////////////////////////////// +// Locate min and max + +namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc { + + void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols, + int& b1rows, int& b2cols, int& b2rows); + + template + void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval, + int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf); + + template + void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, + int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf); + + template + void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval, + int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf); + + template + void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, + int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf); +}}}} + + +void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask) +{ + GpuMat valbuf, locbuf; + minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valbuf, locbuf); +} + + +void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, + const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf) +{ + using namespace mathfunc::minmaxloc; + + typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + + static const Caller callers[2][7] = + { { min_max_loc_multipass_caller, min_max_loc_multipass_caller, + min_max_loc_multipass_caller, min_max_loc_multipass_caller, + min_max_loc_multipass_caller, min_max_loc_multipass_caller, 0 }, + { min_max_loc_caller, min_max_loc_caller, + min_max_loc_caller, min_max_loc_caller, + min_max_loc_caller, min_max_loc_caller, min_max_loc_caller } }; + + static const MaskedCaller masked_callers[2][7] = + { { min_max_loc_mask_multipass_caller, min_max_loc_mask_multipass_caller, + min_max_loc_mask_multipass_caller, min_max_loc_mask_multipass_caller, + min_max_loc_mask_multipass_caller, min_max_loc_mask_multipass_caller, 0 }, + { min_max_loc_mask_caller, min_max_loc_mask_caller, + min_max_loc_mask_caller, min_max_loc_mask_caller, + min_max_loc_mask_caller, min_max_loc_mask_caller, min_max_loc_mask_caller } }; + + CV_Assert(src.channels() == 1); + CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); + CV_Assert(src.type() != CV_64F || 
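// minVal and maxVal may be null when the caller needs only one of the two extrema;
// the wrappers above substitute dummy locals so the type-specific callers always have
// an output to write. Typical calls, assuming d_img is a single-channel GpuMat and
// d_mask a CV_8UC1 mask of the same size:
//   double mn = 0, mx = 0;
//   cv::gpu::minMax(d_img, &mn, &mx);               // whole image
//   cv::gpu::minMax(d_img, 0, &mx, d_mask);         // masked, minimum not needed
//   cv::Point pmin, pmax;
//   cv::gpu::minMaxLoc(d_img, &mn, &mx, &pmin, &pmax);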
hasNativeDoubleSupport(getDevice())); + + double minVal_; if (!minVal) minVal = &minVal_; + double maxVal_; if (!maxVal) maxVal = &maxVal_; + int minLoc_[2]; + int maxLoc_[2]; + + Size valbuf_size, locbuf_size; + get_buf_size_required(src.cols, src.rows, src.elemSize(), valbuf_size.width, + valbuf_size.height, locbuf_size.width, locbuf_size.height); + valbuf.create(valbuf_size, CV_8U); + locbuf.create(locbuf_size, CV_8U); + + if (mask.empty()) + { + Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; + if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type"); + caller(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); + } + else + { + MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()]; + if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type"); + caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); + } + + if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; } + if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; } +} + +////////////////////////////////////////////////////////////////////////////// +// Count non-zero elements + +namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero { + + void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows); + + template + int count_non_zero_caller(const DevMem2D src, PtrStep buf); + + template + int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf); + +}}}} + + +int cv::gpu::countNonZero(const GpuMat& src) +{ + GpuMat buf; + return countNonZero(src, buf); +} + + +int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf) +{ + using namespace mathfunc::countnonzero; + + typedef int (*Caller)(const DevMem2D src, PtrStep buf); + + static const Caller callers[2][7] = + { { count_non_zero_multipass_caller, count_non_zero_multipass_caller, + count_non_zero_multipass_caller, count_non_zero_multipass_caller, + count_non_zero_multipass_caller, count_non_zero_multipass_caller, 0}, + { count_non_zero_caller, count_non_zero_caller, + count_non_zero_caller, count_non_zero_caller, + count_non_zero_caller, count_non_zero_caller, count_non_zero_caller } }; + + CV_Assert(src.channels() == 1); + CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice())); + + Size buf_size; + get_buf_size_required(src.cols, src.rows, buf_size.width, buf_size.height); + buf.create(buf_size, CV_8U); + + Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; + if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type"); + return caller(src, buf); +} + +#endif -- 2.7.4