From: Alexey Spizhevoy
Date: Fri, 26 Nov 2010 17:12:48 +0000 (+0000)
Subject: added first version of gpu::countNonZero for all data types, it doesn't support compu...
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~8296
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=7e2cc1be1b528c5bee54f03c093189d5afb47844;p=platform%2Fupstream%2Fopencv.git

added first version of gpu::countNonZero for all data types, it doesn't support compute capability 1.0 yet, also fixed some little bugs
---

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 0d1e948..41017a4 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -434,6 +434,11 @@ namespace cv
         CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal,
                                   Point* minLoc, Point* maxLoc, GpuMat& valbuf, GpuMat& locbuf);
 
+        //! counts non-zero array elements
+        CV_EXPORTS int countNonZero(const GpuMat& src);
+
+        //! counts non-zero array elements
+        CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
 
         //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
         //! destination array will have the depth type as lut and the same channels number as source
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index 9d0fa75..0fe9e5c 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -69,6 +69,8 @@ void cv::gpu::minMax(const GpuMat&, double*, double*) { throw_nogpu(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*, GpuMat&) { throw_nogpu(); }
 void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*) { throw_nogpu(); }
 void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, GpuMat&, GpuMat&) { throw_nogpu(); }
+int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; }
+int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; }
 void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::exp(const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::log(const GpuMat&, GpuMat&) { throw_nogpu(); }
@@ -527,7 +529,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat&
     int major, minor;
     getComputeCapability(getDevice(), major, minor);
 
-    if (major >= 1 && minor >= 1)
+    if (major > 1 || (major == 1 && minor >= 1))
     {
         switch (src_.type())
         {
@@ -538,7 +540,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat&
         case CV_32S: min_max_caller<int>(src_, minVal, maxVal, buf); break;
         case CV_32F: min_max_caller<float>(src_, minVal, maxVal, buf); break;
         case CV_64F: min_max_caller<double>(src_, minVal, maxVal, buf); break;
-        default: CV_Error(CV_StsBadArg, "Unsupported type");
+        default: CV_Error(CV_StsBadArg, "minMax: unsupported type");
         }
     }
     else
@@ -551,7 +553,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat&
         case CV_16S: min_max_caller_2steps<signed short>(src_, minVal, maxVal, buf); break;
         case CV_32S: min_max_caller_2steps<int>(src_, minVal, maxVal, buf); break;
         case CV_32F: min_max_caller_2steps<float>(src_, minVal, maxVal, buf); break;
-        default: CV_Error(CV_StsBadArg, "Unsupported type");
+        default: CV_Error(CV_StsBadArg, "minMax: unsupported type");
         }
     }
 }
@@ -601,7 +603,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
     int major, minor;
     getComputeCapability(getDevice(), major, minor);
 
-    if (major >= 1 && minor >= 1)
+    if (major > 1 || (major == 1 && minor >= 1))
     {
         switch (src.type())
         {
@@ -612,7 +614,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
         case CV_32S: min_max_loc_caller<int>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
         case CV_32F: min_max_loc_caller<float>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
         case CV_64F: min_max_loc_caller<double>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
-        default: CV_Error(CV_StsBadArg, "Unsupported type");
+        default: CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
         }
     }
     else
@@ -625,7 +627,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
         case CV_16S: min_max_loc_caller_2steps<signed short>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
         case CV_32S: min_max_loc_caller_2steps<int>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
         case CV_32F: min_max_loc_caller_2steps<float>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
-        default: CV_Error(CV_StsBadArg, "Unsupported type");
+        default: CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
         }
     }
 
@@ -634,6 +636,51 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
 }
 
 ////////////////////////////////////////////////////////////////////////
+// Count non zero
+
+namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
+
+    void get_buf_size_required(int& cols, int& rows);
+
+    template <typename T>
+    int count_non_zero_caller(const DevMem2D src, PtrStep buf);
+
+    template <typename T>
+    int count_non_zero_caller_2steps(const DevMem2D src, PtrStep buf);
+
+}}}}
+
+int cv::gpu::countNonZero(const GpuMat& src)
+{
+    GpuMat buf;
+    return countNonZero(src, buf);
+}
+
+int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
+{
+    using namespace mathfunc::countnonzero;
+    CV_Assert(src.channels() == 1);
+
+    Size buf_size;
+    get_buf_size_required(buf_size.width, buf_size.height);
+    buf.create(buf_size, CV_8U);
+
+    switch (src.type())
+    {
+    case CV_8U: return count_non_zero_caller<unsigned char>(src, buf);
+    case CV_8S: return count_non_zero_caller<signed char>(src, buf);
+    case CV_16U: return count_non_zero_caller<unsigned short>(src, buf);
+    case CV_16S: return count_non_zero_caller<signed short>(src, buf);
+    case CV_32S: return count_non_zero_caller<int>(src, buf);
+    case CV_32F: return count_non_zero_caller<float>(src, buf);
+    case CV_64F: return count_non_zero_caller<double>(src, buf);
+    }
+
+    CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
+    return 0;
+}
+
+////////////////////////////////////////////////////////////////////////
 // LUT
 
 void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst)
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index b89a0d7..a70ae69 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -615,6 +615,8 @@ namespace cv { namespace gpu { namespace mathfunc
 
     } // namespace minmax
 
+///////////////////////////////////////////////////////////////////////////////
+// minMaxLoc
 
     namespace minmaxloc
     {
@@ -868,4 +870,126 @@ namespace cv { namespace gpu { namespace mathfunc
 
     } // namespace minmaxloc
 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+// countNonZero
+
+    namespace countnonzero
+    {
+
+    __constant__ int ctwidth;
+    __constant__ int ctheight;
+
+    static const unsigned int czero = 0;
+
+    __device__ unsigned int blocks_finished;
+
+    void estimate_thread_cfg(dim3& threads, dim3& grid)
+    {
+        threads = dim3(64, 4);
+        grid = dim3(6, 5);
+    }
+
+
+    void get_buf_size_required(int& cols, int& rows)
+    {
+        dim3 threads, grid;
+        estimate_thread_cfg(threads, grid);
+        cols = grid.x * grid.y * sizeof(int);
+        rows = 1;
+    }
+
+
+    void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+    {
+        int twidth = divUp(divUp(cols, grid.x), threads.x);
+        int theight = divUp(divUp(rows, grid.y), threads.y);
+        cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
+        cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
+    }
+
+
+    template <int nthreads, typename T>
+    __global__ void count_non_zero_kernel(const DevMem2D src, volatile unsigned int* count)
+    {
+        __shared__ unsigned int scount[nthreads];
+
+        unsigned int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
+        unsigned int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
+        unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        unsigned int cnt = 0;
+        for (unsigned int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
+        {
+            const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
+            for (unsigned int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
+                cnt += ptr[x0 + x * blockDim.x] != 0;
+        }
+
+        scount[tid] = cnt;
+        __syncthreads();
+
+        for (unsigned int step = nthreads / 2; step > 0; step >>= 1)
+        {
+            if (tid < step) scount[tid] += scount[tid + step];
+            __syncthreads();
+        }
+
+        __shared__ bool is_last;
+
+        if (tid == 0)
+        {
+            count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
+            __threadfence();
+
+            unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+            is_last = ticket == gridDim.x * gridDim.y - 1;
+        }
+
+        __syncthreads();
+
+        if (is_last)
+        {
+            scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
+
+            for (unsigned int step = nthreads / 2; step > 0; step >>= 1)
+            {
+                if (tid < step) scount[tid] += scount[tid + step];
+                __syncthreads();
+            }
+
+            if (tid == 0) count[0] = scount[0];
+        }
+    }
+
+
+    template <typename T>
+    int count_non_zero_caller(const DevMem2D src, PtrStep buf)
+    {
+        dim3 threads, grid;
+        estimate_thread_cfg(threads, grid);
+        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+
+        unsigned int* count_buf = (unsigned int*)buf.ptr(0);
+
+        cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
+        count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
+        cudaSafeCall(cudaThreadSynchronize());
+
+        unsigned int count;
+        cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
+
+        return count;
+    }
+
+    template int count_non_zero_caller<uchar>(const DevMem2D, PtrStep);
+    template int count_non_zero_caller<char>(const DevMem2D, PtrStep);
+    template int count_non_zero_caller<ushort>(const DevMem2D, PtrStep);
+    template int count_non_zero_caller<short>(const DevMem2D, PtrStep);
+    template int count_non_zero_caller<int>(const DevMem2D, PtrStep);
+    template int count_non_zero_caller<float>(const DevMem2D, PtrStep);
+    template int count_non_zero_caller<double>(const DevMem2D, PtrStep);
+
+    } // namespace countnonzero
+
 }}}
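A note on the kernel above (not part of the patch): count_non_zero_kernel performs the whole reduction in a single launch. Each block folds its tile into shared memory and writes one partial count, then thread 0 calls __threadfence() so the partial is visible grid-wide and takes a ticket with atomicInc on blocks_finished; the block that draws the last ticket sums all partials into count[0]. Global-memory atomics require compute capability 1.1, which is why the commit message says 1.0 is not supported yet, and why count_non_zero_caller_2steps is declared but not implemented in this commit. Below is a minimal self-contained sketch of that "last block" pattern under the assumption that the grid has at most NT blocks; the names reduce_last_block, finished, and partial_sums are illustrative, not from the patch.

    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>

    // Must be reset to zero before every launch; the patch does this with
    // cudaMemcpyToSymbol(blocks_finished, &czero, ...). Static init suffices
    // here because the sketch launches the kernel only once.
    __device__ unsigned int finished = 0;

    template <int NT>
    __global__ void reduce_last_block(const int* data, int n, volatile unsigned int* partial_sums)
    {
        __shared__ unsigned int smem[NT];
        unsigned int tid = threadIdx.x;
        unsigned int i = blockIdx.x * NT + tid;

        // Stage 1: each block folds its slice into smem[0].
        smem[tid] = i < n ? (data[i] != 0) : 0;
        __syncthreads();
        for (unsigned int step = NT / 2; step > 0; step >>= 1)
        {
            if (tid < step) smem[tid] += smem[tid + step];
            __syncthreads();
        }

        __shared__ bool is_last;
        if (tid == 0)
        {
            partial_sums[blockIdx.x] = smem[0];
            __threadfence();                      // make the partial visible grid-wide
            unsigned int ticket = atomicInc(&finished, gridDim.x);
            is_last = (ticket == gridDim.x - 1);  // true in exactly one block
        }
        __syncthreads();

        // Stage 2: the last block to finish sums the partials into partial_sums[0].
        // Requires gridDim.x <= NT so one block covers every partial.
        if (is_last)
        {
            smem[tid] = tid < gridDim.x ? partial_sums[tid] : 0;
            __syncthreads();
            for (unsigned int step = NT / 2; step > 0; step >>= 1)
            {
                if (tid < step) smem[tid] += smem[tid + step];
                __syncthreads();
            }
            if (tid == 0) partial_sums[0] = smem[0];
        }
    }

    int main()
    {
        const int NT = 256, blocks = 30, n = NT * blocks;
        std::vector<int> h(n, 1);                 // all elements non-zero
        int* d_data; unsigned int* d_partials;
        cudaMalloc(&d_data, n * sizeof(int));
        cudaMalloc(&d_partials, blocks * sizeof(unsigned int));
        cudaMemcpy(d_data, h.data(), n * sizeof(int), cudaMemcpyHostToDevice);
        reduce_last_block<NT><<<blocks, NT>>>(d_data, n, d_partials);
        unsigned int count = 0;
        cudaMemcpy(&count, d_partials, sizeof(count), cudaMemcpyDeviceToHost);
        printf("count = %u (expected %d)\n", count, n);
        cudaFree(d_data); cudaFree(d_partials);
        return 0;
    }

Unlike a two-pass scheme, this needs no second kernel launch; the cost is the grid-wide atomic and the requirement that the per-block partials fit in a buffer one block can reduce alone, which is what get_buf_size_required sizes (a 6x5 grid gives 30 partials, i.e. 120 bytes).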
CvTest("GPU-CountNonZeroTest", "countNonZero") {} + + void run(int) + { + srand(0); + int depth_end; + int major, minor; + cv::gpu::getComputeCapability(getDevice(), major, minor); + + if (minor >= 1) depth_end = CV_64F; else depth_end = CV_32F; + for (int depth = CV_8U; depth <= depth_end; ++depth) + { + for (int i = 0; i < 4; ++i) + { + int rows = 1 + rand() % 1000; + int cols = 1 + rand() % 1000; + test(rows, cols, depth); + } + } + } + + void test(int rows, int cols, int depth) + { + cv::Mat src(rows, cols, depth); + cv::RNG rng; + if (depth == 5) + rng.fill(src, RNG::UNIFORM, Scalar(-1000.f), Scalar(1000.f)); + else if (depth == 6) + rng.fill(src, RNG::UNIFORM, Scalar(-1000.), Scalar(1000.)); + else + for (int i = 0; i < src.rows; ++i) + { + Mat row(1, src.cols * src.elemSize(), CV_8U, src.ptr(i)); + rng.fill(row, RNG::UNIFORM, Scalar(0), Scalar(255)); + } + + int n_gold = cv::countNonZero(src); + int n = cv::gpu::countNonZero(cv::gpu::GpuMat(src)); + + if (n != n_gold) + { + ts->printf(CvTS::CONSOLE, "%d %d %d %d %d\n", n, n_gold, depth, cols, rows); + n_gold = cv::countNonZero(src); + } + + CHECK(n == n_gold, CvTS::FAIL_INVALID_OUTPUT); + } +}; + ///////////////////////////////////////////////////////////////////////////// /////////////////// tests registration ///////////////////////////////////// @@ -850,3 +901,4 @@ CV_GpuNppImageCartToPolarTest CV_GpuNppImageCartToPolar_test; CV_GpuNppImagePolarToCartTest CV_GpuNppImagePolarToCart_test; CV_GpuMinMaxTest CV_GpuMinMaxTest_test; CV_GpuMinMaxLocTest CV_GpuMinMaxLocTest_test; +CV_GpuCountNonZeroTest CV_CountNonZero_test;