From b705e0d886f120799d5470986acfc25dc049b98a Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Tue, 27 Aug 2013 12:27:36 +0400
Subject: [PATCH] used new device layer for cv::gpu::sum

---
 modules/cudaarithm/src/cuda/sum.cu                 | 414 ++++++---------
 modules/cudaarithm/src/reductions.cpp              | 131 -------
 modules/cudev/CMakeLists.txt                       |   2 +-
 .../include/opencv2/cudev/grid/detail/reduce.hpp   |   9 +-
 .../cudev/include/opencv2/cudev/grid/reduce.hpp    |   8 +
 .../cudev/include/opencv2/cudev/util/vec_math.hpp  |  17 +-
 6 files changed, 145 insertions(+), 436 deletions(-)

diff --git a/modules/cudaarithm/src/cuda/sum.cu b/modules/cudaarithm/src/cuda/sum.cu
index cd8c3a1..ab90050 100644
--- a/modules/cudaarithm/src/cuda/sum.cu
+++ b/modules/cudaarithm/src/cuda/sum.cu
@@ -40,342 +40,164 @@
 //
 //M*/
 
-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
 
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-#include "opencv2/core/cuda/utility.hpp"
+#ifndef HAVE_OPENCV_CUDEV
 
-#include "unroll_detail.hpp"
+#error "opencv_cudev is required"
 
-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#else
 
-namespace sum
-{
-    __device__ unsigned int blocks_finished = 0;
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudev.hpp"
 
-    template <typename R, int cn> struct AtomicAdd;
-    template <typename R> struct AtomicAdd<R, 1>
-    {
-        static __device__ void run(R* ptr, R val)
-        {
-            Emulation::glob::atomicAdd(ptr, val);
-        }
-    };
-    template <typename R> struct AtomicAdd<R, 2>
-    {
-        typedef typename TypeVec<R, 2>::vec_type val_type;
+using namespace cv::cudev;
 
-        static __device__ void run(R* ptr, val_type val)
-        {
-            Emulation::glob::atomicAdd(ptr, val.x);
-            Emulation::glob::atomicAdd(ptr + 1, val.y);
-        }
-    };
-    template <typename R> struct AtomicAdd<R, 3>
-    {
-        typedef typename TypeVec<R, 3>::vec_type val_type;
-
-        static __device__ void run(R* ptr, val_type val)
-        {
-            Emulation::glob::atomicAdd(ptr, val.x);
-            Emulation::glob::atomicAdd(ptr + 1, val.y);
-            Emulation::glob::atomicAdd(ptr + 2, val.z);
-        }
-    };
-    template <typename R> struct AtomicAdd<R, 4>
+namespace
+{
+    template <typename T, typename R, int cn>
+    cv::Scalar sumImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
     {
-        typedef typename TypeVec<R, 4>::vec_type val_type;
-
-        static __device__ void run(R* ptr, val_type val)
-        {
-            Emulation::glob::atomicAdd(ptr, val.x);
-            Emulation::glob::atomicAdd(ptr + 1, val.y);
-            Emulation::glob::atomicAdd(ptr + 2, val.z);
-            Emulation::glob::atomicAdd(ptr + 3, val.w);
-        }
-    };
+        typedef typename MakeVec<T, cn>::type src_type;
+        typedef typename MakeVec<R, cn>::type res_type;
 
-    template <int BLOCK_SIZE, typename R, int cn>
-    struct GlobalReduce
-    {
-        typedef typename TypeVec<R, cn>::vec_type result_type;
-
-        static __device__ void run(result_type& sum, result_type* result, int tid, int bid, R* smem)
-        {
-        #if __CUDA_ARCH__ >= 200
-            if (tid == 0)
-                AtomicAdd<R, cn>::run((R*) result, sum);
-        #else
-            __shared__ bool is_last;
-
-            if (tid == 0)
-            {
-                result[bid] = sum;
-
-                __threadfence();
-
-                unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
-                is_last = (ticket == gridDim.x * gridDim.y - 1);
-            }
-
-            __syncthreads();
-
-            if (is_last)
-            {
-                sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);
-
-                device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
-
-                if (tid == 0)
-                {
-                    result[0] = sum;
-                    blocks_finished = 0;
-                }
-            }
-        #endif
-        }
-    };
+        GpuMat_<src_type> src(_src);
+        GpuMat_<res_type> buf(_buf);
 
-    template <int BLOCK_SIZE, typename src_type, typename result_type, class Mask, class Op>
-    __global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Mask mask, const Op op, const int twidth, const int theight)
-    {
-        typedef typename VecTraits<src_type>::elem_type T;
-        typedef typename VecTraits<result_type>::elem_type R;
-        const int cn = VecTraits<src_type>::cn;
-
-        __shared__ R smem[BLOCK_SIZE * cn];
+        if (mask.empty())
+            gridCalcSum(src, buf);
+        else
+            gridCalcSum(src, buf, globPtr<uchar>(mask));
 
-        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
-        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
+        cv::Scalar_<R> res;
+        cv::Mat res_mat(buf.size(), buf.type(), res.val);
+        buf.download(res_mat);
 
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-        const int bid = blockIdx.y * gridDim.x + blockIdx.x;
+        return res;
+    }
 
-        result_type sum = VecTraits<result_type>::all(0);
+    template <typename T, typename R, int cn>
+    cv::Scalar sumAbsImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
+    {
+        typedef typename MakeVec<T, cn>::type src_type;
+        typedef typename MakeVec<R, cn>::type res_type;
 
-        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
-        {
-            const src_type* ptr = src.ptr(y);
+        GpuMat_<src_type> src(_src);
+        GpuMat_<res_type> buf(_buf);
 
-            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
-            {
-                if (mask(y, x))
-                {
-                    const src_type srcVal = ptr[x];
-                    sum = sum + op(saturate_cast<result_type>(srcVal));
-                }
-            }
-        }
+        if (mask.empty())
+            gridCalcSum(abs_(cvt_<res_type>(src)), buf);
+        else
+            gridCalcSum(abs_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));
 
-        device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
+        cv::Scalar_<R> res;
+        cv::Mat res_mat(buf.size(), buf.type(), res.val);
+        buf.download(res_mat);
 
-        GlobalReduce<BLOCK_SIZE, R, cn>::run(sum, result, tid, bid, smem);
+        return res;
     }
 
-    const int threads_x = 32;
-    const int threads_y = 8;
-
-    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
+    template <typename T, typename R, int cn>
+    cv::Scalar sumSqrImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
     {
-        block = dim3(threads_x, threads_y);
+        typedef typename MakeVec<T, cn>::type src_type;
+        typedef typename MakeVec<R, cn>::type res_type;
 
-        grid = dim3(divUp(cols, block.x * block.y),
-                    divUp(rows, block.y * block.x));
+        GpuMat_<src_type> src(_src);
+        GpuMat_<res_type> buf(_buf);
 
-        grid.x = ::min(grid.x, block.x);
-        grid.y = ::min(grid.y, block.y);
-    }
+        if (mask.empty())
+            gridCalcSum(sqr_(cvt_<res_type>(src)), buf);
+        else
+            gridCalcSum(sqr_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));
 
-    void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows)
-    {
-        dim3 block, grid;
-        getLaunchCfg(cols, rows, block, grid);
+        cv::Scalar_<R> res;
+        cv::Mat res_mat(buf.size(), buf.type(), res.val);
+        buf.download(res_mat);
 
-        bufcols = grid.x * grid.y * sizeof(double) * cn;
-        bufrows = 1;
+        return res;
     }
+}
 
-    template <typename T, typename R, int cn, template <typename> class Op>
-    void caller(PtrStepSzb src_, void* buf_, double* out, PtrStepSzb mask)
+cv::Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf)
+{
+    typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
+    static const func_t funcs[7][4] =
     {
-        typedef typename TypeVec<T, cn>::vec_type src_type;
-        typedef typename TypeVec<R, cn>::vec_type result_type;
+        {sumImpl<uchar , float , 1>, sumImpl<uchar , float , 2>, sumImpl<uchar , float , 3>, sumImpl<uchar , float , 4>},
+        {sumImpl<schar , float , 1>, sumImpl<schar , float , 2>, sumImpl<schar , float , 3>, sumImpl<schar , float , 4>},
+        {sumImpl<ushort, float , 1>, sumImpl<ushort, float , 2>, sumImpl<ushort, float , 3>, sumImpl<ushort, float , 4>},
+        {sumImpl<short , float , 1>, sumImpl<short , float , 2>, sumImpl<short , float , 3>, sumImpl<short , float , 4>},
+        {sumImpl<int   , float , 1>, sumImpl<int   , float , 2>, sumImpl<int   , float , 3>, sumImpl<int   , float , 4>},
+        {sumImpl<float , float , 1>, sumImpl<float , float , 2>, sumImpl<float , float , 3>, sumImpl<float , float , 4>},
+        {sumImpl<double, double, 1>, sumImpl<double, double, 2>, sumImpl<double, double, 3>, sumImpl<double, double, 4>}
+    };
 
-        PtrStepSz<src_type> src(src_);
-        result_type* buf = (result_type*) buf_;
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-        dim3 block, grid;
-        getLaunchCfg(src.cols, src.rows, block, grid);
+    CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
 
-        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
-        const int theight = divUp(divUp(src.rows, grid.y), block.y);
+    const int res_depth = std::max(src.depth(), CV_32F);
+    cv::cuda::ensureSizeIsEnough(1, 1, CV_MAKE_TYPE(res_depth, src.channels()), buf);
 
-        Op<result_type> op;
+    const func_t func = funcs[src.depth()][src.channels() - 1];
 
-        if (mask.data)
-            kernel<threads_x * threads_y><<<grid, block>>>(src, buf, SingleMask(mask), op, twidth, theight);
-        else
-            kernel<threads_x * threads_y><<<grid, block>>>(src, buf, WithOutMask(), op, twidth, theight);
-        cudaSafeCall( cudaGetLastError() );
+    return func(src, mask, buf);
+}
 
-        cudaSafeCall( cudaDeviceSynchronize() );
+cv::Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
+{
+    typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
+    static const func_t funcs[7][4] =
+    {
+        {sumAbsImpl<uchar , float , 1>, sumAbsImpl<uchar , float , 2>, sumAbsImpl<uchar , float , 3>, sumAbsImpl<uchar , float , 4>},
+        {sumAbsImpl<schar , float , 1>, sumAbsImpl<schar , float , 2>, sumAbsImpl<schar , float , 3>, sumAbsImpl<schar , float , 4>},
+        {sumAbsImpl<ushort, float , 1>, sumAbsImpl<ushort, float , 2>, sumAbsImpl<ushort, float , 3>, sumAbsImpl<ushort, float , 4>},
+        {sumAbsImpl<short , float , 1>, sumAbsImpl<short , float , 2>, sumAbsImpl<short , float , 3>, sumAbsImpl<short , float , 4>},
+        {sumAbsImpl<int   , float , 1>, sumAbsImpl<int   , float , 2>, sumAbsImpl<int   , float , 3>, sumAbsImpl<int   , float , 4>},
+        {sumAbsImpl<float , float , 1>, sumAbsImpl<float , float , 2>, sumAbsImpl<float , float , 3>, sumAbsImpl<float , float , 4>},
+        {sumAbsImpl<double, double, 1>, sumAbsImpl<double, double, 2>, sumAbsImpl<double, double, 3>, sumAbsImpl<double, double, 4>}
+    };
 
-        R result[4] = {0, 0, 0, 0};
-        cudaSafeCall( cudaMemcpy(&result, buf, sizeof(result_type), cudaMemcpyDeviceToHost) );
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-        out[0] = result[0];
-        out[1] = result[1];
-        out[2] = result[2];
-        out[3] = result[3];
-    }
+    CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
 
-    template <typename T> struct SumType;
-    template <> struct SumType<uchar > { typedef unsigned int R; };
-    template <> struct SumType<schar > { typedef int R; };
-    template <> struct SumType<ushort> { typedef unsigned int R; };
-    template <> struct SumType<short > { typedef int R; };
-    template <> struct SumType<int   > { typedef int R; };
-    template <> struct SumType<float > { typedef float R; };
-    template <> struct SumType<double> { typedef double R; };
-
-    template <typename T, int cn>
-    void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
-    {
-        typedef typename SumType<T>::R R;
-        caller<T, R, cn, identity>(src, buf, out, mask);
-    }
+    const int res_depth = std::max(src.depth(), CV_32F);
+    cv::cuda::ensureSizeIsEnough(1, 1, CV_MAKE_TYPE(res_depth, src.channels()), buf);
 
-    template void run<uchar , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<uchar , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<uchar , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<uchar , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void run<schar , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<schar , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<schar , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<schar , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void run<short , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<short , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<short , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<short , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void run<int   , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<int   , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<int   , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<int   , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void run<float , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<float , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<float , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<float , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void run<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template <typename T, int cn>
-    void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
-    {
-        typedef typename SumType<T>::R R;
-        caller<T, R, cn, abs_func>(src, buf, out, mask);
-    }
+    const func_t func = funcs[src.depth()][src.channels() - 1];
 
-    template void runAbs<uchar , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<uchar , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<uchar , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<uchar , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void runAbs<schar , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<schar , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<schar , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<schar , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void runAbs<short , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<short , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<short , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<short , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void runAbs<int   , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<int   , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<int   , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<int   , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void runAbs<float , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<float , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<float , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<float , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template <typename T> struct Sqr : unary_function<T, T>
+    return func(src, mask, buf);
+}
+
+cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
+{
+    typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
+    static const func_t funcs[7][4] =
     {
-        __device__ __forceinline__ T operator ()(T x) const
-        {
-            return x * x;
-        }
+        {sumSqrImpl<uchar , double, 1>, sumSqrImpl<uchar , double, 2>, sumSqrImpl<uchar , double, 3>, sumSqrImpl<uchar , double, 4>},
+        {sumSqrImpl<schar , double, 1>, sumSqrImpl<schar , double, 2>, sumSqrImpl<schar , double, 3>, sumSqrImpl<schar , double, 4>},
+        {sumSqrImpl<ushort, double, 1>, sumSqrImpl<ushort, double, 2>, sumSqrImpl<ushort, double, 3>, sumSqrImpl<ushort, double, 4>},
+        {sumSqrImpl<short , double, 1>, sumSqrImpl<short , double, 2>, sumSqrImpl<short , double, 3>, sumSqrImpl<short , double, 4>},
+        {sumSqrImpl<int   , double, 1>, sumSqrImpl<int   , double, 2>, sumSqrImpl<int   , double, 3>, sumSqrImpl<int   , double, 4>},
+        {sumSqrImpl<float , double, 1>, sumSqrImpl<float , double, 2>, sumSqrImpl<float , double, 3>, sumSqrImpl<float , double, 4>},
+        {sumSqrImpl<double, double, 1>, sumSqrImpl<double, double, 2>, sumSqrImpl<double, double, 3>, sumSqrImpl<double, double, 4>}
     };
 
-    template <typename T, int cn>
-    void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
-    {
-        caller<T, typename SumType<T>::R, cn, Sqr>(src, buf, out, mask);
-    }
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
+
+    CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
+
+    const int res_depth = CV_64F;
+    cv::cuda::ensureSizeIsEnough(1, 1, CV_MAKE_TYPE(res_depth, src.channels()), buf);
+
+    const func_t func = funcs[src.depth()][src.channels() - 1];
 
-    template void runSqr<uchar , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<uchar , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<uchar , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<uchar , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void runSqr<schar , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<schar , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<schar , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<schar , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void runSqr<short , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<short , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<short , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<short , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void runSqr<int   , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<int   , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<int   , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<int   , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void runSqr<float , 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<float , 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<float , 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<float , 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
+    return func(src, mask, buf);
 }
 
-#endif // CUDA_DISABLER
+#endif
diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp
index 9dd5817..d4e53b7 100644
--- a/modules/cudaarithm/src/reductions.cpp
+++ b/modules/cudaarithm/src/reductions.cpp
@@ -187,137 +187,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT
 }
 
 ////////////////////////////////////////////////////////////////////////
-// Sum
-
-namespace sum
-{
-    void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows);
-
-    template <typename T, int cn>
-    void run(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-
-    template <typename T, int cn>
-    void runAbs(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-
-    template <typename T, int cn>
-    void runSqr(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-}
-
-Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf)
-{
-    GpuMat src = _src.getGpuMat();
-    GpuMat mask = _mask.getGpuMat();
-
-    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-    static const func_t funcs[7][5] =
-    {
-        {0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
-        {0, ::sum::run<schar , 1>, ::sum::run<schar , 2>, ::sum::run<schar , 3>, ::sum::run<schar , 4>},
-        {0, ::sum::run<ushort, 1>, ::sum::run<ushort, 2>, ::sum::run<ushort, 3>, ::sum::run<ushort, 4>},
-        {0, ::sum::run<short , 1>, ::sum::run<short , 2>, ::sum::run<short , 3>, ::sum::run<short , 4>},
-        {0, ::sum::run<int   , 1>, ::sum::run<int   , 2>, ::sum::run<int   , 3>, ::sum::run<int   , 4>},
-        {0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
-        {0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
-    };
-
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
-
-    if (src.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    Size buf_size;
-    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
-    ensureSizeIsEnough(buf_size, CV_8U, buf);
-    buf.setTo(Scalar::all(0));
-
-    const func_t func = funcs[src.depth()][src.channels()];
-
-    double result[4];
-    func(src, buf.data, result, mask);
-
-    return Scalar(result[0], result[1], result[2], result[3]);
-}
-
-Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
-{
-    GpuMat src = _src.getGpuMat();
-    GpuMat mask = _mask.getGpuMat();
-
-    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-    static const func_t funcs[7][5] =
-    {
-        {0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
-        {0, ::sum::runAbs<schar , 1>, ::sum::runAbs<schar , 2>, ::sum::runAbs<schar , 3>, ::sum::runAbs<schar , 4>},
-        {0, ::sum::runAbs<ushort, 1>, ::sum::runAbs<ushort, 2>, ::sum::runAbs<ushort, 3>, ::sum::runAbs<ushort, 4>},
-        {0, ::sum::runAbs<short , 1>, ::sum::runAbs<short , 2>, ::sum::runAbs<short , 3>, ::sum::runAbs<short , 4>},
-        {0, ::sum::runAbs<int   , 1>, ::sum::runAbs<int   , 2>, ::sum::runAbs<int   , 3>, ::sum::runAbs<int   , 4>},
-        {0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
-        {0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
-    };
-
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
-
-    if (src.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    Size buf_size;
-    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
-    ensureSizeIsEnough(buf_size, CV_8U, buf);
-    buf.setTo(Scalar::all(0));
-
-    const func_t func = funcs[src.depth()][src.channels()];
-
-    double result[4];
-    func(src, buf.data, result, mask);
-
-    return Scalar(result[0], result[1], result[2], result[3]);
-}
-
-Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
-{
-    GpuMat src = _src.getGpuMat();
-    GpuMat mask = _mask.getGpuMat();
-
-    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-    static const func_t funcs[7][5] =
-    {
-        {0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
-        {0, ::sum::runSqr<schar , 1>, ::sum::runSqr<schar , 2>, ::sum::runSqr<schar , 3>, ::sum::runSqr<schar , 4>},
-        {0, ::sum::runSqr<ushort, 1>, ::sum::runSqr<ushort, 2>, ::sum::runSqr<ushort, 3>, ::sum::runSqr<ushort, 4>},
-        {0, ::sum::runSqr<short , 1>, ::sum::runSqr<short , 2>, ::sum::runSqr<short , 3>, ::sum::runSqr<short , 4>},
-        {0, ::sum::runSqr<int   , 1>, ::sum::runSqr<int   , 2>, ::sum::runSqr<int   , 3>, ::sum::runSqr<int   , 4>},
-        {0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
-        {0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
-    };
-
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
-
-    if (src.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    Size buf_size;
-    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
-    ensureSizeIsEnough(buf_size, CV_8U, buf);
-    buf.setTo(Scalar::all(0));
-
-    const func_t func = funcs[src.depth()][src.channels()];
-
-    double result[4];
-    func(src, buf.data, result, mask);
-
-    return Scalar(result[0], result[1], result[2], result[3]);
-}
-
-////////////////////////////////////////////////////////////////////////
 // minMax
 
 namespace minMax
diff --git a/modules/cudev/CMakeLists.txt b/modules/cudev/CMakeLists.txt
index 3c7b059..3ea7790 100644
--- a/modules/cudev/CMakeLists.txt
+++ b/modules/cudev/CMakeLists.txt
@@ -4,7 +4,7 @@ endif()
 
 set(the_description "CUDA device layer")
 
-ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable)
+ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable -Wenum-compare)
 
 ocv_add_module(cudev)
diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp
index 3279e19..c220a94 100644
--- a/modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp
@@ -418,9 +418,7 @@ namespace grid_reduce_detail
         const dim3 block(Policy::block_size_x, Policy::block_size_y);
         const dim3 grid(divUp(cols, block.x * Policy::patch_size_x), divUp(rows, block.y * Policy::patch_size_y));
 
-        const int BLOCK_SIZE = Policy::block_size_x * Policy::block_size_y;
-
-        glob_reduce<Reductor, BLOCK_SIZE, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
+        glob_reduce<Reductor, Policy::block_size_x * Policy::block_size_y, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
         CV_CUDEV_SAFE_CALL( cudaGetLastError() );
 
         if (stream == 0)
@@ -433,10 +431,9 @@ namespace grid_reduce_detail
     __host__ void sum(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
     {
         typedef typename PtrTraits<SrcPtr>::value_type src_type;
-        const int cn = VecTraits<src_type>::cn;
-        typedef typename MakeVec<ResType, cn>::type work_type;
+        typedef typename VecTraits<ResType>::elem_type res_elem_type;
 
-        glob_reduce<SumReductor<src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
+        glob_reduce<SumReductor<src_type, ResType>, Policy>(src, (res_elem_type*) result, mask, rows, cols, stream);
     }
 
     template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
diff --git a/modules/cudev/include/opencv2/cudev/grid/reduce.hpp b/modules/cudev/include/opencv2/cudev/grid/reduce.hpp
index 5872ad1..f300946 100644
--- a/modules/cudev/include/opencv2/cudev/grid/reduce.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/reduce.hpp
@@ -59,6 +59,10 @@ namespace cv { namespace cudev {
 template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
 __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
+    typedef typename PtrTraits<SrcPtr>::value_type src_type;
+
+    CV_StaticAssert( VecTraits<src_type>::cn == VecTraits<ResType>::cn, "" );
+
     dst.create(1, 1);
     dst.setTo(0, stream);
 
@@ -77,6 +81,10 @@
 template <class Policy, class SrcPtr, typename ResType>
 __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream& stream = Stream::Null())
 {
+    typedef typename PtrTraits<SrcPtr>::value_type src_type;
+
+    CV_StaticAssert( VecTraits<src_type>::cn == VecTraits<ResType>::cn, "" );
+
     dst.create(1, 1);
     dst.setTo(0, stream);
 
diff --git a/modules/cudev/include/opencv2/cudev/util/vec_math.hpp b/modules/cudev/include/opencv2/cudev/util/vec_math.hpp
index 3ce265e..361ef7b 100644
--- a/modules/cudev/include/opencv2/cudev/util/vec_math.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/vec_math.hpp
@@ -194,10 +194,23 @@ CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uint, uint)
         return VecTraits<output_type ## 4>::make(func (a.x), func (a.y), func (a.z), func (a.w)); \
     }
 
+namespace vec_math_detail
+{
+    __device__ __forceinline__ schar abs_(schar val)
+    {
+        return (schar) ::abs((int) val);
+    }
+
+    __device__ __forceinline__ short abs_(short val)
+    {
+        return (short) ::abs((int) val);
+    }
+}
+
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uchar, uchar)
-CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, char, char)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, vec_math_detail::abs_, char, char)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, ushort, ushort)
-CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, short, short)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, vec_math_detail::abs_, short, short)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, int, int)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uint, uint)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::fabsf, float, float)
-- 
2.7.4
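
Reviewer note (not part of the patch): for anyone unfamiliar with the cudev grid layer this change moves to, the whole call path can be exercised in isolation. The sketch below mirrors what the patch's sumAbsImpl<uchar, float, 1> does internally; it is a minimal illustration only, assuming an OpenCV build with the cudaarithm and cudev modules, a CV_8UC1 input, and a helper name (exampleMaskedAbsSum) invented for the example.

    // Minimal sketch of the device-layer reduction pattern adopted by this patch.
    #include "opencv2/cudaarithm.hpp"
    #include "opencv2/cudev.hpp"

    using namespace cv::cudev;

    // Sum of |x| over an 8-bit single-channel image, optionally masked.
    static double exampleMaskedAbsSum(const cv::cuda::GpuMat& src8u, const cv::cuda::GpuMat& mask8u)
    {
        GpuMat_<uchar> src(src8u);   // typed view; src8u must be CV_8UC1
        GpuMat_<float> buf;          // gridCalcSum creates this as a 1x1 result buffer

        // abs_/cvt_ build an expression template; gridCalcSum instantiates the
        // block/grid reduction from grid/detail/reduce.hpp over it, so no
        // hand-written kernel, shared-memory unrolling, or atomics appear here.
        if (mask8u.empty())
            gridCalcSum(abs_(cvt_<float>(src)), buf);
        else
            gridCalcSum(abs_(cvt_<float>(src)), buf, globPtr<uchar>(mask8u));

        cv::Mat result;
        buf.download(result);        // bring the single float result back to the host
        return result.at<float>(0, 0);
    }

The design tradeoff is the same one the patch makes throughout: launch configuration, shared-memory reduction, and cross-block combination disappear behind gridCalcSum, and each public function shrinks to a depth/channel dispatch table plus a download of the 1x1 result buffer.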