CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, \r
GpuMat& valbuf, GpuMat& locbuf);\r
\r
+ //! counts non-zero array elements\r
+ CV_EXPORTS int countNonZero(const GpuMat& src);\r
+\r
+ //! counts non-zero array elements; buf is reused as intermediate storage, avoiding a fresh device allocation per call
+ CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);\r
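+
+ //! A minimal usage sketch (illustrative, not part of this patch; img1/img2
+ //! are hypothetical single-channel device matrices). The buffered overload
+ //! lets the caller allocate the scratch buffer once and reuse it:
+ //!     GpuMat buf;
+ //!     int n1 = countNonZero(img1, buf);
+ //!     int n2 = countNonZero(img2, buf); // buf is reused, no reallocation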
\r
//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))\r
//! destination array will have the same depth as lut and the same number of channels as source
void cv::gpu::minMax(const GpuMat&, double*, double*, GpuMat&) { throw_nogpu(); }\r
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*) { throw_nogpu(); }\r
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, GpuMat&, GpuMat&) { throw_nogpu(); }\r
+int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; }\r
+int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; }\r
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&) { throw_nogpu(); }\r
void cv::gpu::exp(const GpuMat&, GpuMat&) { throw_nogpu(); }\r
void cv::gpu::log(const GpuMat&, GpuMat&) { throw_nogpu(); }\r
int major, minor;\r
getComputeCapability(getDevice(), major, minor);\r
\r
- if (major >= 1 && minor >= 1)\r
+ if (major > 1 || (major == 1 && minor >= 1))\r
{\r
switch (src_.type())\r
{\r
case CV_32S: min_max_caller<int>(src_, minVal, maxVal, buf); break;\r
case CV_32F: min_max_caller<float>(src_, minVal, maxVal, buf); break;\r
case CV_64F: min_max_caller<double>(src_, minVal, maxVal, buf); break;\r
- default: CV_Error(CV_StsBadArg, "Unsupported type");\r
+ default: CV_Error(CV_StsBadArg, "minMax: unsupported type");\r
}\r
}\r
else
{
    switch (src_.type())
    {
    case CV_16S: min_max_caller_2steps<signed short>(src_, minVal, maxVal, buf); break;
case CV_32S: min_max_caller_2steps<int>(src_, minVal, maxVal, buf); break;\r
case CV_32F: min_max_caller_2steps<float>(src_, minVal, maxVal, buf); break;\r
- default: CV_Error(CV_StsBadArg, "Unsupported type");\r
+ default: CV_Error(CV_StsBadArg, "minMax: unsupported type");\r
}\r
}\r
}\r
int major, minor;\r
getComputeCapability(getDevice(), major, minor);\r
\r
- if (major >= 1 && minor >= 1)\r
+ if (major > 1 || (major == 1 && minor >= 1))\r
{ \r
switch (src.type())\r
{\r
case CV_32S: min_max_loc_caller<int>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;\r
case CV_32F: min_max_loc_caller<float>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;\r
case CV_64F: min_max_loc_caller<double>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;\r
- default: CV_Error(CV_StsBadArg, "Unsupported type");\r
+ default: CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");\r
}\r
}\r
else
{
    switch (src.type())
    {
    case CV_16S: min_max_loc_caller_2steps<signed short>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
case CV_32S: min_max_loc_caller_2steps<int>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;\r
case CV_32F: min_max_loc_caller_2steps<float>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;\r
- default: CV_Error(CV_StsBadArg, "Unsupported type");\r
+ default: CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");\r
}\r
}\r
\r
}\r
\r
////////////////////////////////////////////////////////////////////////\r
+// Count non zero\r
+\r
+namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {\r
+\r
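+    // Device-side entry points. The *_2steps variant mirrors the minMax code
+    // path for pre-1.1 devices; the single-pass kernel presumably relies on
+    // the global atomics introduced with compute capability 1.1.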
+ void get_buf_size_required(int& cols, int& rows);\r
+\r
+ template <typename T> \r
+ int count_non_zero_caller(const DevMem2D src, PtrStep buf);\r
+\r
+ template <typename T> \r
+ int count_non_zero_caller_2steps(const DevMem2D src, PtrStep buf);\r
+\r
+}}}}\r
+\r
+int cv::gpu::countNonZero(const GpuMat& src)\r
+{\r
+ GpuMat buf;\r
+ return countNonZero(src, buf);\r
+}\r
+\r
+int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)\r
+{\r
+ using namespace mathfunc::countnonzero;\r
+ CV_Assert(src.channels() == 1);\r
+\r
+ Size buf_size;\r
+ get_buf_size_required(buf_size.width, buf_size.height);\r
+ buf.create(buf_size, CV_8U);\r
+\r
+ switch (src.type())\r
+ {\r
+ case CV_8U: return count_non_zero_caller<unsigned char>(src, buf);\r
+ case CV_8S: return count_non_zero_caller<signed char>(src, buf);\r
+ case CV_16U: return count_non_zero_caller<unsigned short>(src, buf);\r
+ case CV_16S: return count_non_zero_caller<signed short>(src, buf);\r
+ case CV_32S: return count_non_zero_caller<int>(src, buf);\r
+ case CV_32F: return count_non_zero_caller<float>(src, buf);\r
+ case CV_64F: return count_non_zero_caller<double>(src, buf);\r
+ }\r
+\r
+ CV_Error(CV_StsBadArg, "countNonZero: unsupported type");\r
+ return 0;\r
+}\r
+\r
+////////////////////////////////////////////////////////////////////////\r
// LUT\r
\r
void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst)\r
\r
} // namespace minmax\r
\r
+///////////////////////////////////////////////////////////////////////////////\r
+// minMaxLoc\r
\r
namespace minmaxloc {\r
\r
\r
} // namespace minmaxloc\r
\r
+//////////////////////////////////////////////////////////////////////////////////////////////////////////\r
+// countNonZero\r
+\r
+ namespace countnonzero \r
+ {\r
+\r
+ __constant__ int ctwidth;\r
+ __constant__ int ctheight;\r
+\r
+ static const unsigned int czero = 0;\r
+\r
+ __device__ unsigned int blocks_finished;\r
+\r
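+        // Fixed launch configuration: 64x4 = 256 threads per block on a 6x5
+        // grid (30 blocks), small enough that one 256-thread block can reduce
+        // all per-block partial counts in the final stage.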
+ void estimate_thread_cfg(dim3& threads, dim3& grid)\r
+ {\r
+ threads = dim3(64, 4);\r
+ grid = dim3(6, 5);\r
+ }\r
+\r
+\r
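+        // Scratch buffer layout: one 32-bit partial count per block, exposed
+        // as a single row of bytes (the host wrapper allocates it as CV_8U).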
+ void get_buf_size_required(int& cols, int& rows)\r
+ {\r
+ dim3 threads, grid;\r
+ estimate_thread_cfg(threads, grid);\r
+ cols = grid.x * grid.y * sizeof(int);\r
+ rows = 1;\r
+ }\r
+\r
+\r
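+        // Uploads the per-thread tile extents to constant memory: each thread
+        // visits up to ctwidth x ctheight elements, strided by the block size.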
+ void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)\r
+ { \r
+ int twidth = divUp(divUp(cols, grid.x), threads.x);\r
+ int theight = divUp(divUp(rows, grid.y), threads.y);\r
+ cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); \r
+ cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); \r
+ }\r
+\r
+\r
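+        // Single-launch reduction: each block folds its tile into shared
+        // memory and publishes a partial count; the last block to finish
+        // (tracked by an atomic ticket on blocks_finished) reduces the
+        // per-block counts into count[0].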
+ template <int nthreads, typename T>\r
+ __global__ void count_non_zero_kernel(const DevMem2D src, volatile unsigned int* count)\r
+ {\r
+ __shared__ unsigned int scount[nthreads];\r
+\r
+ unsigned int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
+ unsigned int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
+ unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
+ unsigned int cnt = 0;\r
+ for (unsigned int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)\r
+ {\r
+ const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);\r
+ for (unsigned int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
+ cnt += ptr[x0 + x * blockDim.x] != 0;\r
+ }\r
+\r
+ scount[tid] = cnt;\r
+ __syncthreads();\r
+\r
+ for (unsigned int step = nthreads / 2; step > 0; step >>= 1)\r
+ {\r
+ if (tid < step) scount[tid] += scount[tid + step];\r
+ __syncthreads();\r
+ }\r
+\r
+ __shared__ bool is_last;\r
+\r
+ if (tid == 0)\r
+ {\r
+ count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];\r
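+                // Make this block's partial count globally visible before the
+                // ticket is taken.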
+ __threadfence();\r
+\r
+ unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
+ is_last = ticket == gridDim.x * gridDim.y - 1;\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ if (is_last)\r
+ {\r
+            scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
+            __syncthreads();
+
+ for (unsigned int step = nthreads / 2; step > 0; step >>= 1)\r
+ {\r
+ if (tid < step) scount[tid] += scount[tid + step];\r
+ __syncthreads();\r
+ }\r
+\r
+ if (tid == 0) count[0] = scount[0];\r
+ }\r
+\r
+ }\r
+\r
+ \r
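+        // Host launcher: resets the completion ticket, launches the kernel
+        // with 256 threads (matching the 64x4 block of estimate_thread_cfg),
+        // and reads the final count back from the first buffer element.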
+ template <typename T>\r
+ int count_non_zero_caller(const DevMem2D src, PtrStep buf)\r
+ {\r
+ dim3 threads, grid;\r
+ estimate_thread_cfg(threads, grid);\r
+ estimate_kernel_consts(src.cols, src.rows, threads, grid);\r
+\r
+ unsigned int* count_buf = (unsigned int*)buf.ptr(0);\r
+\r
+ cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));\r
+ count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+\r
+ unsigned int count;\r
+ cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
+ \r
+ return count;\r
+ } \r
+\r
+ template int count_non_zero_caller<unsigned char>(const DevMem2D, PtrStep);\r
+ template int count_non_zero_caller<signed char>(const DevMem2D, PtrStep);\r
+ template int count_non_zero_caller<unsigned short>(const DevMem2D, PtrStep);\r
+ template int count_non_zero_caller<signed short>(const DevMem2D, PtrStep);\r
+ template int count_non_zero_caller<int>(const DevMem2D, PtrStep);\r
+ template int count_non_zero_caller<float>(const DevMem2D, PtrStep);\r
+ template int count_non_zero_caller<double>(const DevMem2D, PtrStep);\r
+\r
+ } // namespace countnonzero\r
+\r
}}}\r
for (int cn = 1; cn <= 4; ++cn)\r
for (int depth = CV_8U; depth <= depth_end; ++depth)\r
{\r
- int rows = 1, cols = 3;\r
- test(rows, cols, cn, depth);\r
- for (int i = 0; i < 4; ++i)\r
+ for (int i = 0; i < 1; ++i)\r
{\r
int rows = 1 + rand() % 1000;\r
int cols = 1 + rand() % 1000;
test(rows, cols, cn, depth);
}
};\r
\r
+////////////////////////////////////////////////////////////////////////////\r
+// Count non zero\r
+struct CV_GpuCountNonZeroTest: CvTest \r
+{\r
+ CV_GpuCountNonZeroTest(): CvTest("GPU-CountNonZeroTest", "countNonZero") {}\r
+\r
+ void run(int) \r
+ {\r
+ srand(0);\r
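+        // Fixed seed keeps the randomized matrix sizes reproducible.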
+ int depth_end;\r
+ int major, minor;\r
+ cv::gpu::getComputeCapability(getDevice(), major, minor);\r
+\r
+        if (major > 1 || (major == 1 && minor >= 1)) depth_end = CV_64F; else depth_end = CV_32F;
+ for (int depth = CV_8U; depth <= depth_end; ++depth)\r
+ {\r
+ for (int i = 0; i < 4; ++i)\r
+ {\r
+ int rows = 1 + rand() % 1000;\r
+ int cols = 1 + rand() % 1000;\r
+ test(rows, cols, depth);\r
+ }\r
+ }\r
+ }\r
+\r
+ void test(int rows, int cols, int depth)\r
+ {\r
+ cv::Mat src(rows, cols, depth);\r
+ cv::RNG rng;\r
+        if (depth == CV_32F)
+            rng.fill(src, RNG::UNIFORM, Scalar(-1000.f), Scalar(1000.f));
+        else if (depth == CV_64F)
+            rng.fill(src, RNG::UNIFORM, Scalar(-1000.), Scalar(1000.));
+ else\r
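+            // For integer depths, fill the underlying bytes of each row
+            // directly; one code path covers every integer element size.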
+ for (int i = 0; i < src.rows; ++i)\r
+ { \r
+ Mat row(1, src.cols * src.elemSize(), CV_8U, src.ptr(i));\r
+ rng.fill(row, RNG::UNIFORM, Scalar(0), Scalar(255));\r
+ }\r
+\r
+ int n_gold = cv::countNonZero(src);\r
+ int n = cv::gpu::countNonZero(cv::gpu::GpuMat(src));\r
+\r
+ if (n != n_gold)\r
+ {\r
+            ts->printf(CvTS::CONSOLE, "got: %d, expected: %d, depth: %d, cols: %d, rows: %d\n", n, n_gold, depth, cols, rows);
+ }\r
+\r
+ CHECK(n == n_gold, CvTS::FAIL_INVALID_OUTPUT);\r
+ }\r
+};\r
+\r
\r
/////////////////////////////////////////////////////////////////////////////\r
/////////////////// tests registration /////////////////////////////////////\r
CV_GpuNppImagePolarToCartTest CV_GpuNppImagePolarToCart_test;\r
CV_GpuMinMaxTest CV_GpuMinMaxTest_test;\r
CV_GpuMinMaxLocTest CV_GpuMinMaxLocTest_test;\r
+CV_GpuCountNonZeroTest CV_GpuCountNonZeroTest_test;