From fa5422a2ad0f5fdb7476b1e0b6f76430a44e8e0d Mon Sep 17 00:00:00 2001 From: Alexey Spizhevoy Date: Thu, 25 Nov 2010 10:19:06 +0000 Subject: [PATCH] added buf support into gpu::minMax --- modules/gpu/include/opencv2/gpu/gpu.hpp | 3 + modules/gpu/src/arithm.cpp | 54 +++++++++--------- modules/gpu/src/cuda/mathfunc.cu | 98 ++++++++++++++++++--------------- tests/gpu/src/arithm.cpp | 4 +- 4 files changed, 90 insertions(+), 69 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 4dcbd0b..0edd50a 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -424,6 +424,9 @@ namespace cv //! finds global minimum and maximum array elements and returns their values CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0); + //! finds global minimum and maximum array elements and returns their values + CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat& buf); + //! finds global minimum and maximum array elements and returns their values with locations CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0); diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 5bd0022..ca31707 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -66,6 +66,7 @@ double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); } Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); } void cv::gpu::minMax(const GpuMat&, double*, double*) { throw_nogpu(); } +void cv::gou::minMax(const GpuMat&, double*, double*, GpuMat&) { throw_nogpu(); } void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*) { throw_nogpu(); } void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&) { throw_nogpu(); } void cv::gpu::exp(const GpuMat&, GpuMat&) { throw_nogpu(); } @@ -492,47 +493,50 @@ Scalar cv::gpu::sum(const GpuMat& src) namespace cv { namespace gpu { namespace mathfunc { namespace minmax { - void get_buf_size_required(int elem_size, int& b1cols, int& b1rows, - int& b2cols, int& b2rows); + void get_buf_size_required(int elem_size, int& cols, int& rows); template - void min_max_caller(const DevMem2D src, double* minval, double* maxval, - unsigned char* minval_buf, unsigned char* maxval_buf); + void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf); template - void min_max_caller_2steps(const DevMem2D src, double* minval, double* maxval, - unsigned char* minval_buf, unsigned char* maxval_buf); + void min_max_caller_2steps(const DevMem2D src, double* minval, double* maxval, PtrStep buf); }}}} void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal) { + GpuMat buf; + minMax(src, minVal, maxVal, buf); +} + + +void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat& buf) +{ using namespace mathfunc::minmax; double maxVal_; if (!maxVal) maxVal = &maxVal_; GpuMat src_ = src.reshape(1); - - // Allocate GPU buffers - Size b1size, b2size; - get_buf_size_required(src.elemSize(), b1size.width, b1size.height, b2size.width, b2size.height); - GpuMat b1(b1size, CV_8U), b2(b2size, CV_8U); + + Size bufSize; + get_buf_size_required(src.elemSize(), bufSize.width, bufSize.height); + buf.create(bufSize, CV_8U); int major, minor; getComputeCapability(getDevice(), major, minor); - + if (major >= 1 && minor >= 1) { switch (src_.type()) { - case CV_8U: min_max_caller(src_, minVal, maxVal, b1.data, b2.data); break; - case CV_8S: min_max_caller(src_, minVal, maxVal, b1.data, b2.data); break; - case CV_16U: min_max_caller(src_, minVal, maxVal, b1.data, b2.data); break; - case CV_16S: min_max_caller(src_, minVal, maxVal, b1.data, b2.data); break; - case CV_32S: min_max_caller(src_, minVal, maxVal, b1.data, b2.data); break; - case CV_32F: min_max_caller(src_, minVal, maxVal, b1.data, b2.data); break; - case CV_64F: min_max_caller(src_, minVal, maxVal, b1.data, b2.data); break; + case CV_8U: min_max_caller(src_, minVal, maxVal, buf); break; + case CV_8S: min_max_caller(src_, minVal, maxVal, buf); break; + case CV_16U: min_max_caller(src_, minVal, maxVal, buf); break; + case CV_16S: min_max_caller(src_, minVal, maxVal, buf); break; + case CV_32S: min_max_caller(src_, minVal, maxVal, buf); break; + case CV_32F: min_max_caller(src_, minVal, maxVal, buf); break; + case CV_64F: min_max_caller(src_, minVal, maxVal, buf); break; default: CV_Error(CV_StsBadArg, "Unsupported type"); } } @@ -540,12 +544,12 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal) { switch (src_.type()) { - case CV_8U: min_max_caller_2steps(src_, minVal, maxVal, b1.data, b2.data); break; - case CV_8S: min_max_caller_2steps(src_, minVal, maxVal, b1.data, b2.data); break; - case CV_16U: min_max_caller_2steps(src_, minVal, maxVal, b1.data, b2.data); break; - case CV_16S: min_max_caller_2steps(src_, minVal, maxVal, b1.data, b2.data); break; - case CV_32S: min_max_caller_2steps(src_, minVal, maxVal, b1.data, b2.data); break; - case CV_32F: min_max_caller_2steps(src_, minVal, maxVal, b1.data, b2.data); break; + case CV_8U: min_max_caller_2steps(src_, minVal, maxVal, buf); break; + case CV_8S: min_max_caller_2steps(src_, minVal, maxVal, buf); break; + case CV_16U: min_max_caller_2steps(src_, minVal, maxVal, buf); break; + case CV_16S: min_max_caller_2steps(src_, minVal, maxVal, buf); break; + case CV_32S: min_max_caller_2steps(src_, minVal, maxVal, buf); break; + case CV_32F: min_max_caller_2steps(src_, minVal, maxVal, buf); break; default: CV_Error(CV_StsBadArg, "Unsupported type"); } } diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu index 11afda3..8d5006f 100644 --- a/modules/gpu/src/cuda/mathfunc.cu +++ b/modules/gpu/src/cuda/mathfunc.cu @@ -401,7 +401,7 @@ namespace cv { namespace gpu { namespace mathfunc ////////////////////////////////////////////////////////////////////////////// // Min max - // To avoid shared banck confilict we convert reach value into value of + // To avoid shared bank conflicts we convert each value into value of // appropriate type (32 bits minimum) template struct MinMaxTypeTraits {}; template <> struct MinMaxTypeTraits { typedef int best_type; }; @@ -423,6 +423,10 @@ namespace cv { namespace gpu { namespace mathfunc static const unsigned int czero = 0; + // Global counter of blocks finished its work + __device__ unsigned int blocks_finished; + + // Estimates good thread configuration // - threads variable satisfies to threads.x * threads.y == 256 void estimate_thread_cfg(dim3& threads, dim3& grid) @@ -431,15 +435,17 @@ namespace cv { namespace gpu { namespace mathfunc grid = dim3(6, 5); } + // Returns required buffer sizes - void get_buf_size_required(int elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows) + void get_buf_size_required(int elem_size, int& cols, int& rows) { dim3 threads, grid; estimate_thread_cfg(threads, grid); - b1cols = grid.x * grid.y * elem_size; b1rows = 1; - b2cols = grid.x * grid.y * elem_size; b2rows = 1; + cols = grid.x * grid.y * elem_size; + rows = 2; } + // Estimates device constants which are used in the kernels using specified thread configuration void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) { @@ -449,6 +455,7 @@ namespace cv { namespace gpu { namespace mathfunc cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight))); } + // Does min and max in shared memory template __device__ void merge(unsigned int tid, unsigned int offset, volatile T* minval, volatile T* maxval) @@ -457,8 +464,6 @@ namespace cv { namespace gpu { namespace mathfunc maxval[tid] = max(maxval[tid], maxval[tid + offset]); } - // Global counter of blocks finished its work - __device__ unsigned int blocks_finished; template __global__ void min_max_kernel(int cols, int rows, const PtrStep src, T* minval, T* maxval) @@ -535,33 +540,19 @@ namespace cv { namespace gpu { namespace mathfunc #endif } - // This kernel will be used only when compute capability is 1.0 - template - __global__ void min_max_kernel_2ndstep(T* minval, T* maxval, int size) - { - T val; - T mymin = numeric_limits_gpu::max(); - T mymax = numeric_limits_gpu::min(); - for (unsigned int i = 0; i < size; ++i) - { - val = minval[i]; if (val < mymin) mymin = val; - val = maxval[i]; if (val > mymax) mymax = val; - } - minval[0] = mymin; - maxval[0] = mymax; - } template - void min_max_caller(const DevMem2D src, double* minval, double* maxval, - unsigned char* minval_buf, unsigned char* maxval_buf) + void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf) { dim3 threads, grid; estimate_thread_cfg(threads, grid); estimate_kernel_consts(src.cols, src.rows, threads, grid); - cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished))); - min_max_kernel<256, T><<>>(src.cols, src.rows, src, (T*)minval_buf, (T*)maxval_buf); + T* minval_buf = (T*)buf.ptr(0); + T* maxval_buf = (T*)buf.ptr(1); + cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished))); + min_max_kernel<256, T><<>>(src.cols, src.rows, src, minval_buf, maxval_buf); cudaSafeCall(cudaThreadSynchronize()); T minval_, maxval_; @@ -569,19 +560,47 @@ namespace cv { namespace gpu { namespace mathfunc cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); *minval = minval_; *maxval = maxval_; + } + + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + + + // This kernel will be used only when compute capability is 1.0 + template + __global__ void min_max_kernel_2ndstep(T* minval, T* maxval, int size) + { + T val; + T mymin = numeric_limits_gpu::max(); + T mymax = numeric_limits_gpu::min(); + for (unsigned int i = 0; i < size; ++i) + { + val = minval[i]; if (val < mymin) mymin = val; + val = maxval[i]; if (val > mymax) mymax = val; + } + minval[0] = mymin; + maxval[0] = mymax; } + template - void min_max_caller_2steps(const DevMem2D src, double* minval, double* maxval, - unsigned char* minval_buf, unsigned char* maxval_buf) + void min_max_caller_2steps(const DevMem2D src, double* minval, double* maxval, PtrStep buf) { dim3 threads, grid; estimate_thread_cfg(threads, grid); estimate_kernel_consts(src.cols, src.rows, threads, grid); + T* minval_buf = (T*)buf.ptr(0); + T* maxval_buf = (T*)buf.ptr(1); + cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished))); - min_max_kernel<256, T><<>>(src.cols, src.rows, src, (T*)minval_buf, (T*)maxval_buf); - min_max_kernel_2ndstep<<<1, 1>>>((T*)minval_buf, (T*)maxval_buf, grid.x * grid.y); + min_max_kernel<256, T><<>>(src.cols, src.rows, src, minval_buf, maxval_buf); + min_max_kernel_2ndstep<<<1, 1>>>(minval_buf, maxval_buf, grid.x * grid.y); cudaSafeCall(cudaThreadSynchronize()); T minval_, maxval_; @@ -591,23 +610,16 @@ namespace cv { namespace gpu { namespace mathfunc *maxval = maxval_; } - template void min_max_caller(const DevMem2D, double*, double*, unsigned char*, unsigned char*); - template void min_max_caller(const DevMem2D, double*, double*, unsigned char*, unsigned char*); - template void min_max_caller(const DevMem2D, double*, double*, unsigned char*, unsigned char*); - template void min_max_caller(const DevMem2D, double*, double*, unsigned char*, unsigned char*); - template void min_max_caller(const DevMem2D, double*, double*, unsigned char*, unsigned char*); - template void min_max_caller(const DevMem2D, double*, double*, unsigned char*, unsigned char*); - template void min_max_caller(const DevMem2D, double*, double*, unsigned char*, unsigned char*); - - template void min_max_caller_2steps(const DevMem2D, double*, double*, unsigned char*, unsigned char*); - template void min_max_caller_2steps(const DevMem2D, double*, double*, unsigned char*, unsigned char*); - template void min_max_caller_2steps(const DevMem2D, double*, double*, unsigned char*, unsigned char*); - template void min_max_caller_2steps(const DevMem2D, double*, double*, unsigned char*, unsigned char*); - template void min_max_caller_2steps(const DevMem2D, double*, double*, unsigned char*, unsigned char*); - template void min_max_caller_2steps(const DevMem2D, double*, double*, unsigned char*, unsigned char*); + template void min_max_caller_2steps(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller_2steps(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller_2steps(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller_2steps(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller_2steps(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller_2steps(const DevMem2D, double*, double*, PtrStep); } // namespace minmax + namespace minmaxloc { template struct OptLoc {}; diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp index cbdea2d..600203b 100644 --- a/tests/gpu/src/arithm.cpp +++ b/tests/gpu/src/arithm.cpp @@ -676,6 +676,8 @@ struct CV_GpuMinMaxTest: public CvTest { CV_GpuMinMaxTest(): CvTest("GPU-MinMaxTest", "minMax") {} + cv::gpu::GpuMat buf; + void run(int) { int depth_end; @@ -732,7 +734,7 @@ struct CV_GpuMinMaxTest: public CvTest double minVal_, maxVal_; cv::Point minLoc_, maxLoc_; - cv::gpu::minMax(cv::gpu::GpuMat(src), &minVal_, &maxVal_); + cv::gpu::minMax(cv::gpu::GpuMat(src), &minVal_, &maxVal_, buf); if (abs(minVal - minVal_) > 1e-3f) { -- 2.7.4