From 020624c481be8457169cf170dc7aa78a9359e5ea Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 26 Aug 2013 11:25:56 +0400 Subject: [PATCH] used new device layer for cv::gpu::minMaxLoc --- modules/cudaarithm/src/cuda/minmaxloc.cu | 231 ++++++++----------------------- modules/cudaarithm/src/reductions.cpp | 50 ------- 2 files changed, 61 insertions(+), 220 deletions(-) diff --git a/modules/cudaarithm/src/cuda/minmaxloc.cu b/modules/cudaarithm/src/cuda/minmaxloc.cu index 2374504..6f8cc53 100644 --- a/modules/cudaarithm/src/cuda/minmaxloc.cu +++ b/modules/cudaarithm/src/cuda/minmaxloc.cu @@ -40,197 +40,88 @@ // //M*/ -#if !defined CUDA_DISABLER +#include "opencv2/opencv_modules.hpp" -#include "opencv2/core/cuda/common.hpp" -#include "opencv2/core/cuda/vec_traits.hpp" -#include "opencv2/core/cuda/vec_math.hpp" -#include "opencv2/core/cuda/functional.hpp" -#include "opencv2/core/cuda/reduce.hpp" -#include "opencv2/core/cuda/emulation.hpp" -#include "opencv2/core/cuda/limits.hpp" -#include "opencv2/core/cuda/utility.hpp" +#ifndef HAVE_OPENCV_CUDEV -using namespace cv::cuda; -using namespace cv::cuda::device; +#error "opencv_cudev is required" -namespace minMaxLoc -{ - // To avoid shared bank conflicts we convert each value into value of - // appropriate type (32 bits minimum) - template struct MinMaxTypeTraits; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef float best_type; }; - template <> struct MinMaxTypeTraits { typedef double best_type; }; - - template - __global__ void kernel_pass_1(const PtrStepSz src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight) - { - typedef typename MinMaxTypeTraits::best_type work_type; - - __shared__ work_type sminval[BLOCK_SIZE]; - __shared__ work_type smaxval[BLOCK_SIZE]; - __shared__ unsigned int sminloc[BLOCK_SIZE]; - __shared__ unsigned int smaxloc[BLOCK_SIZE]; +#else - const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x; - const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y; +#include "opencv2/cudaarithm.hpp" +#include "opencv2/cudev.hpp" - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - const int bid = blockIdx.y * gridDim.x + blockIdx.x; +using namespace cv::cudev; - work_type mymin = numeric_limits::max(); - work_type mymax = -numeric_limits::max(); - unsigned int myminloc = 0; - unsigned int mymaxloc = 0; - - for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y) - { - const T* ptr = src.ptr(y); - - for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x) - { - if (mask(y, x)) - { - const work_type srcVal = ptr[x]; - - if (srcVal < mymin) - { - mymin = srcVal; - myminloc = y * src.cols + x; - } - - if (srcVal > mymax) - { - mymax = srcVal; - mymaxloc = y * src.cols + x; - } - } - } - } - - reduceKeyVal(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), - smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc), - tid, - thrust::make_tuple(less(), greater())); - - if (tid == 0) - { - minval[bid] = (T) mymin; - maxval[bid] = (T) mymax; - minloc[bid] = myminloc; - maxloc[bid] = mymaxloc; - } - } - template - __global__ void kernel_pass_2(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int count) +namespace +{ + template + void minMaxLocImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc) { - typedef typename MinMaxTypeTraits::best_type work_type; + typedef typename SelectIf< + TypesEquals::value, + double, + typename SelectIf::value, float, int>::type + >::type work_type; + + const GpuMat_& src = (const GpuMat_&) _src; + GpuMat_& valBuf = (GpuMat_&) _valBuf; + GpuMat_& locBuf = (GpuMat_&) _locBuf; + + if (mask.empty()) + gridMinMaxLoc(src, valBuf, locBuf); + else + gridMinMaxLoc(src, valBuf, locBuf, globPtr(mask)); - __shared__ work_type sminval[BLOCK_SIZE]; - __shared__ work_type smaxval[BLOCK_SIZE]; - __shared__ unsigned int sminloc[BLOCK_SIZE]; - __shared__ unsigned int smaxloc[BLOCK_SIZE]; + cv::Mat_ h_valBuf; + cv::Mat_ h_locBuf; - unsigned int idx = ::min(threadIdx.x, count - 1); + valBuf.download(h_valBuf); + locBuf.download(h_locBuf); - work_type mymin = minval[idx]; - work_type mymax = maxval[idx]; - unsigned int myminloc = minloc[idx]; - unsigned int mymaxloc = maxloc[idx]; + if (minVal) + *minVal = h_valBuf(0, 0); - reduceKeyVal(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), - smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc), - threadIdx.x, - thrust::make_tuple(less(), greater())); + if (maxVal) + *maxVal = h_valBuf(1, 0); - if (threadIdx.x == 0) + if (minLoc) { - minval[0] = (T) mymin; - maxval[0] = (T) mymax; - minloc[0] = myminloc; - maxloc[0] = mymaxloc; + const int idx = h_locBuf(0, 0); + *minLoc = cv::Point(idx % src.cols, idx / src.cols); } - } - - const int threads_x = 32; - const int threads_y = 8; - - void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid) - { - block = dim3(threads_x, threads_y); - grid = dim3(divUp(cols, block.x * block.y), - divUp(rows, block.y * block.x)); - - grid.x = ::min(grid.x, block.x); - grid.y = ::min(grid.y, block.y); - } - - void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows) - { - dim3 block, grid; - getLaunchCfg(cols, rows, block, grid); - - // For values - b1cols = (int)(grid.x * grid.y * elem_size); - b1rows = 2; - - // For locations - b2cols = grid.x * grid.y * sizeof(int); - b2rows = 2; + if (maxLoc) + { + const int idx = h_locBuf(1, 0); + *maxLoc = cv::Point(idx % src.cols, idx / src.cols); + } } +} - template - void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf) +void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray _mask, GpuMat& valBuf, GpuMat& locBuf) +{ + typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc); + static const func_t funcs[] = { - dim3 block, grid; - getLaunchCfg(src.cols, src.rows, block, grid); - - const int twidth = divUp(divUp(src.cols, grid.x), block.x); - const int theight = divUp(divUp(src.rows, grid.y), block.y); + minMaxLocImpl, + minMaxLocImpl, + minMaxLocImpl, + minMaxLocImpl, + minMaxLocImpl, + minMaxLocImpl, + minMaxLocImpl + }; - T* minval_buf = (T*) valbuf.ptr(0); - T* maxval_buf = (T*) valbuf.ptr(1); - unsigned int* minloc_buf = locbuf.ptr(0); - unsigned int* maxloc_buf = locbuf.ptr(1); - - if (mask.data) - kernel_pass_1<<>>((PtrStepSz) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight); - else - kernel_pass_1<<>>((PtrStepSz) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight); + GpuMat src = _src.getGpuMat(); + GpuMat mask = _mask.getGpuMat(); - cudaSafeCall( cudaGetLastError() ); + CV_Assert( src.channels() == 1 ); + CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) ); - kernel_pass_2<<<1, threads_x * threads_y>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - - T minval_, maxval_; - cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); - cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); - *minval = minval_; - *maxval = maxval_; - - unsigned int minloc_, maxloc_; - cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; - maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; - } + const func_t func = funcs[src.depth()]; - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); - template void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); + func(src, mask, valBuf, locBuf, minVal, maxVal, minLoc, maxLoc); } -#endif // CUDA_DISABLER +#endif diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp index e3a8d6d..a56c8a1 100644 --- a/modules/cudaarithm/src/reductions.cpp +++ b/modules/cudaarithm/src/reductions.cpp @@ -186,56 +186,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT return retVal; } -//////////////////////////////////////////////////////////////////////// -// minMaxLoc - -namespace minMaxLoc -{ - void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows); - - template - void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); -} - -void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, - InputArray _mask, GpuMat& valBuf, GpuMat& locBuf) -{ - GpuMat src = _src.getGpuMat(); - GpuMat mask = _mask.getGpuMat(); - - typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep locbuf); - static const func_t funcs[] = - { - ::minMaxLoc::run, - ::minMaxLoc::run, - ::minMaxLoc::run, - ::minMaxLoc::run, - ::minMaxLoc::run, - ::minMaxLoc::run, - ::minMaxLoc::run - }; - - CV_Assert( src.channels() == 1 ); - CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) ); - - if (src.depth() == CV_64F) - { - if (!deviceSupports(NATIVE_DOUBLE)) - CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); - } - - Size valbuf_size, locbuf_size; - ::minMaxLoc::getBufSize(src.cols, src.rows, src.elemSize(), valbuf_size.width, valbuf_size.height, locbuf_size.width, locbuf_size.height); - ensureSizeIsEnough(valbuf_size, CV_8U, valBuf); - ensureSizeIsEnough(locbuf_size, CV_8U, locBuf); - - const func_t func = funcs[src.depth()]; - - double temp1, temp2; - Point temp3, temp4; - func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, minLoc ? &minLoc->x : &temp3.x, maxLoc ? &maxLoc->x : &temp4.x, valBuf, locBuf); -} - ////////////////////////////////////////////////////////////////////////////// // countNonZero -- 2.7.4