\r
CV_EXPORTS void getGpuMemInfo(size_t& free, size_t& total);\r
\r
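+ // Runtime capability queries used to select kernels: native doubles require
+ // compute capability >= 1.3, global-memory atomics require >= 1.1.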
+ CV_EXPORTS bool hasNativeDoubleSupport(int device);\r
+ CV_EXPORTS bool hasAtomicsSupport(int device);\r
+\r
//////////////////////////////// Error handling ////////////////////////\r
\r
CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);\r
get_buf_size_required(buf_size.width, buf_size.height);\r
buf.create(buf_size, CV_8U);\r
\r
- switch (src.type())\r
+ int device = getDevice();\r
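+ // The single-pass kernel relies on global atomics; on pre-1.1 devices fall
+ // back to the two-step reduction below.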
+ if (hasAtomicsSupport(device))\r
+ { \r
+ switch (src.type())\r
+ {\r
+ case CV_8U: return count_non_zero_caller<unsigned char>(src, buf);\r
+ case CV_8S: return count_non_zero_caller<signed char>(src, buf);\r
+ case CV_16U: return count_non_zero_caller<unsigned short>(src, buf);\r
+ case CV_16S: return count_non_zero_caller<signed short>(src, buf);\r
+ case CV_32S: return count_non_zero_caller<int>(src, buf);\r
+ case CV_32F: return count_non_zero_caller<float>(src, buf);\r
+ case CV_64F: \r
+ if (hasNativeDoubleSupport(device)) \r
+ return count_non_zero_caller<double>(src, buf);\r
+ }\r
+ }\r
+ else\r
{\r
- case CV_8U: return count_non_zero_caller<unsigned char>(src, buf);\r
- case CV_8S: return count_non_zero_caller<signed char>(src, buf);\r
- case CV_16U: return count_non_zero_caller<unsigned short>(src, buf);\r
- case CV_16S: return count_non_zero_caller<signed short>(src, buf);\r
- case CV_32S: return count_non_zero_caller<int>(src, buf);\r
- case CV_32F: return count_non_zero_caller<float>(src, buf);\r
- case CV_64F: return count_non_zero_caller<double>(src, buf);\r
+ switch (src.type())\r
+ {\r
+ case CV_8U: return count_non_zero_caller_2steps<unsigned char>(src, buf);\r
+ case CV_8S: return count_non_zero_caller_2steps<signed char>(src, buf);\r
+ case CV_16U: return count_non_zero_caller_2steps<unsigned short>(src, buf);\r
+ case CV_16S: return count_non_zero_caller_2steps<signed short>(src, buf);\r
+ case CV_32S: return count_non_zero_caller_2steps<int>(src, buf);\r
+ case CV_32F: return count_non_zero_caller_2steps<float>(src, buf);\r
+ }\r
}\r
\r
CV_Error(CV_StsBadArg, "countNonZero: unsupported type");\r
}\r
\r
\r
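+ // Tree reduction of `size` elements held in shared memory. Stages wider than
+ // a warp are separated by __syncthreads(); the last 32 elements are summed
+ // warp-synchronously through the volatile pointer.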
+ template <int size, typename T>\r
+ __device__ void sum_shared_mem(volatile T* data, const unsigned int tid)\r
+ {\r
+ T sum = data[tid];\r
+\r
+ if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
+ if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
+ if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }
+\r
+ if (tid < 32)\r
+ {\r
+ if (size >= 64) data[tid] = sum = sum + data[tid + 32];\r
+ if (size >= 32) data[tid] = sum = sum + data[tid + 16];\r
+ if (size >= 16) data[tid] = sum = sum + data[tid + 8];\r
+ if (size >= 8) data[tid] = sum = sum + data[tid + 4];\r
+ if (size >= 4) data[tid] = sum = sum + data[tid + 2];\r
+ if (size >= 2) data[tid] = sum = sum + data[tid + 1];\r
+ }\r
+ }\r
+\r
+\r
template <int nthreads, typename T>\r
__global__ void count_non_zero_kernel(const DevMem2D src, volatile unsigned int* count)\r
{\r
scount[tid] = cnt;\r
__syncthreads();\r
\r
- for (unsigned int step = nthreads / 2; step > 0; step >>= 1)\r
- {\r
- if (tid < step) scount[tid] += scount[tid + step];\r
- __syncthreads();\r
- }\r
+ sum_shared_mem<nthreads, unsigned int>(scount, tid);\r
\r
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
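+ // Global atomics available: the block flagged as last (is_last) folds all
+ // per-block partial counts into count[0] within this same launch.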
__shared__ bool is_last;\r
\r
if (tid == 0)\r
if (is_last)\r
{\r
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;\r
-\r
- for (unsigned int step = nthreads / 2; step > 0; step >>= 1)\r
- {\r
- if (tid < step) scount[tid] += scount[tid + step];\r
- __syncthreads();\r
- }\r
-\r
+ sum_shared_mem<nthreads, unsigned int>(scount, tid);\r
if (tid == 0) count[0] = scount[0];\r
}\r
-\r
+#else\r
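+ // No global atomics: store this block's partial count; the host then runs
+ // count_non_zero_kernel_2ndstep to finish the reduction.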
+ if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];\r
+#endif\r
}\r
\r
\r
template int count_non_zero_caller<float>(const DevMem2D, PtrStep);\r
template int count_non_zero_caller<double>(const DevMem2D, PtrStep);\r
\r
+\r
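+ // Second kernel of the two-step path: a single block sums the per-block
+ // partial counts produced by count_non_zero_kernel.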
+ template <int nthreads, typename T>\r
+ __global__ void count_non_zero_kernel_2ndstep(unsigned int* count, int size)\r
+ {\r
+ __shared__ unsigned int scount[nthreads];\r
+ unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
+ scount[tid] = tid < size ? count[tid] : 0;\r
+ sum_shared_mem<nthreads, unsigned int>(scount, tid);\r
+\r
+ if (tid == 0) count[0] = scount[0];\r
+ }\r
+\r
+\r
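+ // Host-side caller for devices without global atomics: run the per-block
+ // kernel, then the single-block second step, and copy back count[0].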
+ template <typename T>\r
+ int count_non_zero_caller_2steps(const DevMem2D src, PtrStep buf)\r
+ {\r
+ dim3 threads, grid;\r
+ estimate_thread_cfg(threads, grid);\r
+ estimate_kernel_consts(src.cols, src.rows, threads, grid);\r
+\r
+ unsigned int* count_buf = (unsigned int*)buf.ptr(0);\r
+\r
+ cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));\r
+ count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);\r
+ count_non_zero_kernel_2ndstep<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+\r
+ unsigned int count;\r
+ cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));\r
+ \r
+ return count;\r
+ } \r
+\r
+ template int count_non_zero_caller_2steps<unsigned char>(const DevMem2D, PtrStep);\r
+ template int count_non_zero_caller_2steps<signed char>(const DevMem2D, PtrStep);\r
+ template int count_non_zero_caller_2steps<unsigned short>(const DevMem2D, PtrStep);\r
+ template int count_non_zero_caller_2steps<signed short>(const DevMem2D, PtrStep);\r
+ template int count_non_zero_caller_2steps<int>(const DevMem2D, PtrStep);\r
+ template int count_non_zero_caller_2steps<float>(const DevMem2D, PtrStep);\r
+\r
} // namespace countnonzero\r
\r
}}}\r
CV_EXPORTS void cv::gpu::getComputeCapability(int /*device*/, int& /*major*/, int& /*minor*/) { throw_nogpu(); } \r
CV_EXPORTS int cv::gpu::getNumberOfSMs(int /*device*/) { throw_nogpu(); return 0; } \r
CV_EXPORTS void cv::gpu::getGpuMemInfo(size_t& /*free*/, size_t& /*total*/) { throw_nogpu(); } \r
+CV_EXPORTS bool cv::gpu::hasNativeDoubleSupport(int /*device*/) { throw_nogpu(); return false; }\r
+CV_EXPORTS bool cv::gpu::hasAtomicsSupport(int /*device*/) { throw_nogpu(); return false; }\r
\r
\r
#else /* !defined (HAVE_CUDA) */\r
cudaSafeCall( cudaMemGetInfo( &free, &total ) );\r
}\r
\r
+CV_EXPORTS bool cv::gpu::hasNativeDoubleSupport(int device)\r
+{\r
+ int major, minor;\r
+ getComputeCapability(device, major, minor);\r
+ return major > 1 || (major == 1 && minor >= 3);\r
+}\r
+\r
+CV_EXPORTS bool cv::gpu::hasAtomicsSupport(int device) \r
+{\r
+ int major, minor;\r
+ getComputeCapability(device, major, minor);\r
+ return major > 1 || (major == 1 && minor >= 1);\r
+}\r
+\r
#endif\r
\r
void run(int)\r
{\r
int depth_end;\r
- int major, minor;\r
- cv::gpu::getComputeCapability(getDevice(), major, minor);\r
-\r
- if (minor >= 1) depth_end = CV_64F; else depth_end = CV_32F;\r
-\r
+ depth_end = cv::gpu::hasNativeDoubleSupport(cv::gpu::getDevice()) ? CV_64F : CV_32F;
for (int cn = 1; cn <= 4; ++cn)\r
for (int depth = CV_8U; depth <= depth_end; ++depth)\r
{\r
void run(int)\r
{\r
int depth_end;\r
- int major, minor;\r
- cv::gpu::getComputeCapability(getDevice(), major, minor);\r
-\r
- if (minor >= 1) depth_end = CV_64F; else depth_end = CV_32F;\r
+ depth_end = cv::gpu::hasNativeDoubleSupport(cv::gpu::getDevice()) ? CV_64F : CV_32F;
for (int depth = CV_8U; depth <= depth_end; ++depth)\r
{\r
int rows = 1, cols = 3;\r
{\r
srand(0);\r
int depth_end;\r
- int major, minor;\r
- cv::gpu::getComputeCapability(getDevice(), major, minor);\r
-\r
- if (minor >= 1) depth_end = CV_64F; else depth_end = CV_32F;\r
- for (int depth = CV_8U; depth <= depth_end; ++depth)\r
+ depth_end = cv::gpu::hasNativeDoubleSupport(cv::gpu::getDevice()) ? CV_64F : CV_32F;
+ for (int depth = CV_8U; depth <= depth_end; ++depth)
{\r
for (int i = 0; i < 4; ++i)\r
{\r