From 282e01cb4aee753e5312c03f524ab4b8b63ab461 Mon Sep 17 00:00:00 2001 From: Alexey Spizhevoy Date: Wed, 24 Nov 2010 08:55:52 +0000 Subject: [PATCH] added support of all data types into gpu::minMax --- modules/gpu/include/opencv2/gpu/gpu.hpp | 2 - modules/gpu/src/arithm.cpp | 80 +++++++++----------- modules/gpu/src/cuda/mathfunc.cu | 128 ++++++++++++++++++++++++++++++++ tests/gpu/src/arithm.cpp | 93 ++++++++++++++++------- tests/gpu/src/gputest_main.cpp | 1 - 5 files changed, 230 insertions(+), 74 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 42e30b1..70f9eda 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -422,8 +422,6 @@ namespace cv CV_EXPORTS Scalar sum(const GpuMat& m); //! finds global minimum and maximum array elements and returns their values - //! supports CV_8UC1 and CV_8UC4 type - //! disabled until fix npp bug CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal = 0); //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i)) diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 2c6b3a9..b94db38 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -489,55 +489,45 @@ Scalar cv::gpu::sum(const GpuMat& src) //////////////////////////////////////////////////////////////////////// // minMax -namespace -{ - void minMax_c1(const GpuMat& src, double* minVal, double* maxVal) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Npp8u min_res, max_res; - - nppSafeCall( nppiMinMax_8u_C1R(src.ptr(), src.step, sz, &min_res, &max_res) ); - - if (minVal) - *minVal = min_res; - - if (maxVal) - *maxVal = max_res; - } - - void minMax_c4(const GpuMat& src, double* minVal, double* maxVal) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Npp8u* cuMem; - - cuMem = nppsMalloc_8u(8); - - nppSafeCall( nppiMinMax_8u_C4R(src.ptr(), src.step, sz, cuMem, cuMem + 4) ); - - if (minVal) - cudaMemcpy(minVal, cuMem, 4 * sizeof(Npp8u), cudaMemcpyDeviceToHost); - if (maxVal) - cudaMemcpy(maxVal, cuMem + 4, 4 * sizeof(Npp8u), cudaMemcpyDeviceToHost); - - nppsFree(cuMem); - } -} +namespace cv { namespace gpu { namespace mathfunc { + template + void min_max_caller(const DevMem2D src, double* minval, double* maxval); +}}} void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal) { - typedef void (*minMax_t)(const GpuMat& src, double* minVal, double* maxVal); - static const minMax_t minMax_callers[] = {0, minMax_c1, 0, 0, minMax_c4}; + CV_Assert(src.channels() == 1); - CV_Assert(!"disabled until fix npp bug"); - CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4); - - minMax_callers[src.channels()](src, minVal, maxVal); + double maxVal_; + if (!maxVal) + maxVal = &maxVal_; + + switch (src.type()) + { + case CV_8U: + mathfunc::min_max_caller(src, minVal, maxVal); + break; + case CV_8S: + mathfunc::min_max_caller(src, minVal, maxVal); + break; + case CV_16U: + mathfunc::min_max_caller(src, minVal, maxVal); + break; + case CV_16S: + mathfunc::min_max_caller(src, minVal, maxVal); + break; + case CV_32S: + mathfunc::min_max_caller(src, minVal, maxVal); + break; + case CV_32F: + mathfunc::min_max_caller(src, minVal, maxVal); + break; + case CV_64F: + mathfunc::min_max_caller(src, minVal, maxVal); + break; + default: + CV_Error(CV_StsBadArg, "Unsupported type"); + } } //////////////////////////////////////////////////////////////////////// diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu index 73b76a8..d557288 100644 --- a/modules/gpu/src/cuda/mathfunc.cu +++ b/modules/gpu/src/cuda/mathfunc.cu @@ -393,4 +393,132 @@ namespace cv { namespace gpu { namespace mathfunc { bitwise_bin_op(rows, cols, src1, src2, dst, elem_size, Mask8U(mask), stream); } + + + +////////////////////////////////////////////////////////////////////////////// +// Min max + + enum { MIN, MAX }; + template struct Cmp {}; + + template + struct Cmp + { + static __device__ void call(unsigned int tid, unsigned int offset, volatile T* optval) + { + T val = optval[tid + offset]; + if (val < optval[tid]) optval[tid] = val; + //optval[tid] = min(optval[tid], optval[tid + offset]); + } + }; + + template + struct Cmp + { + static __device__ void call(unsigned int tid, unsigned int offset, volatile T* optval) + { + T val = optval[tid + offset]; + if (val > optval[tid]) optval[tid] = val; + //optval[tid] = max(optval[tid], optval[tid + offset]); + } + }; + + + template + __global__ void opt_kernel(int cols, int rows, const PtrStep src, PtrStep optval) + { + __shared__ T soptval[nthreads]; + + unsigned int x0 = blockIdx.x * blockDim.x; + unsigned int y0 = blockIdx.y * blockDim.y; + unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; + + if (x0 + threadIdx.x < cols && y0 + threadIdx.y < rows) + soptval[tid] = ((const T*)src.ptr(y0 + threadIdx.y))[x0 + threadIdx.x]; + else + soptval[tid] = ((const T*)src.ptr(y0))[x0]; + + __syncthreads(); + + if (nthreads >= 512) if (tid < 256) { Cmp::call(tid, 256, soptval); __syncthreads(); } + if (nthreads >= 256) if (tid < 128) { Cmp::call(tid, 128, soptval); __syncthreads(); } + if (nthreads >= 128) if (tid < 64) { Cmp::call(tid, 64, soptval); __syncthreads(); } + + if (tid < 32) + { + if (nthreads >= 64) Cmp::call(tid, 32, soptval); + if (nthreads >= 32) Cmp::call(tid, 16, soptval); + if (nthreads >= 16) Cmp::call(tid, 8, soptval); + if (nthreads >= 8) Cmp::call(tid, 4, soptval); + if (nthreads >= 4) Cmp::call(tid, 2, soptval); + if (nthreads >= 2) Cmp::call(tid, 1, soptval); + } + + if (tid == 0) ((T*)optval.ptr(blockIdx.y))[blockIdx.x] = soptval[0]; + } + + + template + void min_max_caller(const DevMem2D src, double* minval, double* maxval) + { + dim3 threads(32, 8); + + // Allocate memory for aux. buffers + DevMem2D minval_buf[2]; DevMem2D maxval_buf[2]; + minval_buf[0].cols = divUp(src.cols, threads.x); + minval_buf[0].rows = divUp(src.rows, threads.y); + minval_buf[1].cols = divUp(minval_buf[0].cols, threads.x); + minval_buf[1].rows = divUp(minval_buf[0].rows, threads.y); + maxval_buf[0].cols = divUp(src.cols, threads.x); + maxval_buf[0].rows = divUp(src.rows, threads.y); + maxval_buf[1].cols = divUp(maxval_buf[0].cols, threads.x); + maxval_buf[1].rows = divUp(maxval_buf[0].rows, threads.y); + cudaSafeCall(cudaMallocPitch(&minval_buf[0].data, &minval_buf[0].step, minval_buf[0].cols * sizeof(T), minval_buf[0].rows)); + cudaSafeCall(cudaMallocPitch(&minval_buf[1].data, &minval_buf[1].step, minval_buf[1].cols * sizeof(T), minval_buf[1].rows)); + cudaSafeCall(cudaMallocPitch(&maxval_buf[0].data, &maxval_buf[0].step, maxval_buf[0].cols * sizeof(T), maxval_buf[0].rows)); + cudaSafeCall(cudaMallocPitch(&maxval_buf[1].data, &maxval_buf[1].step, maxval_buf[1].cols * sizeof(T), maxval_buf[1].rows)); + + int curbuf = 0; + dim3 cursize(src.cols, src.rows); + dim3 grid(divUp(cursize.x, threads.x), divUp(cursize.y, threads.y)); + + opt_kernel<256, Cmp, T><<>>(cursize.x, cursize.y, src, minval_buf[curbuf]); + opt_kernel<256, Cmp, T><<>>(cursize.x, cursize.y, src, maxval_buf[curbuf]); + cursize = grid; + + while (cursize.x > 1 || cursize.y > 1) + { + grid.x = divUp(cursize.x, threads.x); + grid.y = divUp(cursize.y, threads.y); + opt_kernel<256, Cmp, T><<>>(cursize.x, cursize.y, minval_buf[curbuf], minval_buf[1 - curbuf]); + opt_kernel<256, Cmp, T><<>>(cursize.x, cursize.y, maxval_buf[curbuf], maxval_buf[1 - curbuf]); + curbuf = 1 - curbuf; + cursize = grid; + } + + cudaSafeCall(cudaThreadSynchronize()); + + // Copy results from device to host + T minval_, maxval_; + cudaSafeCall(cudaMemcpy(&minval_, minval_buf[curbuf].ptr(0), sizeof(T), cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf[curbuf].ptr(0), sizeof(T), cudaMemcpyDeviceToHost)); + *minval = minval_; + *maxval = maxval_; + + // Release aux. buffers + cudaSafeCall(cudaFree(minval_buf[0].data)); + cudaSafeCall(cudaFree(minval_buf[1].data)); + cudaSafeCall(cudaFree(maxval_buf[0].data)); + cudaSafeCall(cudaFree(maxval_buf[1].data)); + } + + template void min_max_caller(const DevMem2D, double*, double*); + template void min_max_caller(const DevMem2D, double*, double*); + template void min_max_caller(const DevMem2D, double*, double*); + template void min_max_caller(const DevMem2D, double*, double*); + template void min_max_caller(const DevMem2D, double*, double*); + template void min_max_caller(const DevMem2D, double*, double*); + template void min_max_caller(const DevMem2D, double*, double*); + }}} diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp index 0c4fd1b..f159d9d 100644 --- a/tests/gpu/src/arithm.cpp +++ b/tests/gpu/src/arithm.cpp @@ -48,6 +48,11 @@ using namespace cv; using namespace std; using namespace gpu; +#define CHECK(pred, err) if (!(pred)) { \ + ts->printf(CvTS::LOG, "Fail: \"%s\" at line: %d\n", #pred, __LINE__); \ + ts->set_failed_test_info(err); \ + return; } + class CV_GpuArithmTest : public CvTest { public: @@ -477,31 +482,6 @@ struct CV_GpuNppImageSumTest : public CV_GpuArithmTest }; //////////////////////////////////////////////////////////////////////////////// -// minNax -struct CV_GpuNppImageMinNaxTest : public CV_GpuArithmTest -{ - CV_GpuNppImageMinNaxTest() : CV_GpuArithmTest( "GPU-NppImageMinNax", "minNax" ) {} - - int test( const Mat& mat1, const Mat& ) - { - if (mat1.type() != CV_8UC1) - { - ts->printf(CvTS::LOG, "\nUnsupported type\n"); - return CvTS::OK; - } - - double cpumin, cpumax; - cv::minMaxLoc(mat1, &cpumin, &cpumax); - - GpuMat gpu1(mat1); - double gpumin, gpumax; - cv::gpu::minMax(gpu1, &gpumin, &gpumax); - - return (CheckNorm(cpumin, gpumin) == CvTS::OK && CheckNorm(cpumax, gpumax) == CvTS::OK) ? CvTS::OK : CvTS::FAIL_GENERIC; - } -}; - -//////////////////////////////////////////////////////////////////////////////// // LUT struct CV_GpuNppImageLUTTest : public CV_GpuArithmTest { @@ -689,6 +669,67 @@ struct CV_GpuNppImagePolarToCartTest : public CV_GpuArithmTest } }; +//////////////////////////////////////////////////////////////////////////////// +// Min max + +struct CV_GpuMinMaxTest: public CvTest +{ + CV_GpuMinMaxTest(): CvTest("GPU-MinMaxTest", "minMax") {} + + void run(int) + { + for (int type = CV_8U; type <= CV_64F; ++type) + { + int rows = 1, cols = 3; + test(rows, cols, type); + for (int i = 0; i < 4; ++i) + { + int rows = 1 + rand() % 1000; + int cols = 1 + rand() % 1000; + test(rows, cols, type); + } + } + } + + void test(int rows, int cols, int type) + { + cv::Mat src(rows, cols, type); + cv::RNG rng; + for (int i = 0; i < src.rows; ++i) + { + Mat row(1, src.cols * src.elemSize(), CV_8U, src.ptr(i)); + rng.fill(row, RNG::UNIFORM, Scalar(0), Scalar(255)); + } + + double minVal, maxVal; + if (type != CV_8S) + { + cv::Point minLoc, maxLoc; + cv::minMaxLoc(src, &minVal, &maxVal, &minLoc, &maxLoc); + } + else + { + // OpenCV's minMaxLoc doesn't support CV_8S type + minVal = std::numeric_limits::max(); + maxVal = std::numeric_limits::min(); + for (int i = 0; i < src.rows; ++i) + for (int j = 0; j < src.cols; ++j) + { + char val = src.at(i, j); + if (val < minVal) minVal = val; + if (val > maxVal) maxVal = val; + } + } + + double minVal_, maxVal_; + cv::Point minLoc_, maxLoc_; + cv::gpu::minMax(cv::gpu::GpuMat(src), &minVal_, &maxVal_); + + CHECK(minVal == minVal_, CvTS::FAIL_INVALID_OUTPUT); + CHECK(maxVal == maxVal_, CvTS::FAIL_INVALID_OUTPUT); + } +}; + ///////////////////////////////////////////////////////////////////////////// /////////////////// tests registration ///////////////////////////////////// @@ -709,7 +750,6 @@ CV_GpuNppImageMeanStdDevTest CV_GpuNppImageMeanStdDev_test; CV_GpuNppImageNormTest CV_GpuNppImageNorm_test; CV_GpuNppImageFlipTest CV_GpuNppImageFlip_test; CV_GpuNppImageSumTest CV_GpuNppImageSum_test; -CV_GpuNppImageMinNaxTest CV_GpuNppImageMinNax_test; CV_GpuNppImageLUTTest CV_GpuNppImageLUT_test; CV_GpuNppImageExpTest CV_GpuNppImageExp_test; CV_GpuNppImageLogTest CV_GpuNppImageLog_test; @@ -717,3 +757,4 @@ CV_GpuNppImageMagnitudeTest CV_GpuNppImageMagnitude_test; CV_GpuNppImagePhaseTest CV_GpuNppImagePhase_test; CV_GpuNppImageCartToPolarTest CV_GpuNppImageCartToPolar_test; CV_GpuNppImagePolarToCartTest CV_GpuNppImagePolarToCart_test; +CV_GpuMinMaxTest CV_GpuMinMaxTest_test; diff --git a/tests/gpu/src/gputest_main.cpp b/tests/gpu/src/gputest_main.cpp index f608de8..4bce0e1 100644 --- a/tests/gpu/src/gputest_main.cpp +++ b/tests/gpu/src/gputest_main.cpp @@ -48,7 +48,6 @@ const char* blacklist[] = "GPU-MatOperatorAsyncCall", // crash "GPU-NppImageSum", // crash, probably npp bug - "GPU-NppImageMinNax", // npp bug - don't find min/max near right border "GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR 0 -- 2.7.4