namespace cv { namespace gpu { namespace mathfunc\r
{\r
template <typename T>\r
- void sum_caller(const DevMem2D src, PtrStep buf, double* sum);\r
+ void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);\r
\r
template <typename T>\r
- void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum);\r
+ void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);\r
\r
template <typename T>\r
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum);\r
\r
namespace sum\r
{\r
- void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);\r
+ void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows);\r
}\r
}}}\r
\r
Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf) \r
{\r
using namespace mathfunc;\r
- CV_Assert(src.channels() == 1);
+ CV_Assert(src.channels() <= 2); // only the C1 and C2 paths are implemented below
\r
- typedef void (*Caller)(const DevMem2D, PtrStep, double*);\r
+ typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);\r
static const Caller callers[2][7] = \r
{ { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>, \r
sum_multipass_caller<unsigned short>, sum_multipass_caller<short>, \r
sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },\r
{ sum_caller<unsigned char>, sum_caller<char>, \r
sum_caller<unsigned short>, sum_caller<short>, \r
- sum_caller<int>, sum_caller<float>, sum_caller<double> } };\r
+ sum_caller<int>, sum_caller<float>, 0 } };\r
\r
Size bufSize;\r
- sum::get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height); \r
+ sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); \r
buf.create(bufSize, CV_8U);\r
\r
- Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];\r
+ Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];\r
if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");\r
\r
- double result;\r
- caller(src, buf, &result);\r
- return result;\r
+ double result[4];\r
+ caller(src, buf, result, src.channels());\r
+ return Scalar(result[0], result[1], result[2], result[3]);\r
}\r
\r
Scalar cv::gpu::sqrSum(const GpuMat& src) \r
sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },\r
{ sqsum_caller<unsigned char>, sqsum_caller<char>, \r
sqsum_caller<unsigned short>, sqsum_caller<short>, \r
- sqsum_caller<int>, sqsum_caller<float>, sqsum_caller<double> } };\r
+ sqsum_caller<int>, sqsum_caller<float>, 0 } };\r
\r
Size bufSize;\r
- sum::get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height); \r
+ sum::get_buf_size_required(src.cols, src.rows, 1, bufSize.width, bufSize.height); \r
buf.create(bufSize, CV_8U);\r
\r
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];\r
\r
#include "opencv2/gpu/device/limits_gpu.hpp"\r
#include "opencv2/gpu/device/saturate_cast.hpp"\r
+#include "opencv2/gpu/device/vecmath.hpp"\r
#include "transform.hpp"\r
#include "internal_shared.hpp"\r
\r
}\r
\r
\r
- void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)\r
+ void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows)\r
{\r
dim3 threads, grid;\r
estimate_thread_cfg(cols, rows, threads, grid);\r
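+ // One partial sum per block and per channel; sizeof(double) covers the widest accumulator type.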
- bufcols = grid.x * grid.y * sizeof(double);\r
+ bufcols = grid.x * grid.y * sizeof(double) * cn;\r
bufrows = 1;\r
}\r
\r
}\r
\r
template <typename T, typename R, typename Op, int nthreads>\r
- __global__ void sum_kernel(const DevMem2D_<T> src, R* result)\r
+ __global__ void sum_kernel(const DevMem2D src, R* result)\r
{\r
__shared__ R smem[nthreads];\r
\r
R sum = 0;\r
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)\r
{\r
- const T* ptr = src.ptr(y0 + y * blockDim.y);\r
+ const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);\r
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
sum += Op::call(ptr[x0 + x * blockDim.x]);\r
}\r
result[0] = smem[0];\r
}\r
\r
+\r
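+ // Two-channel variant of sum_kernel: pixels are read as vec2 values, each thread keeps a
+ // per-channel partial sum, and partials are reduced in shared memory with the x components
+ // in the first half of smem and the y components in the second.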
+ template <typename T, typename R, typename Op, int nthreads>\r
+ __global__ void sum_kernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)\r
+ {\r
+ typedef typename TypeVec<T, 2>::vec_t SrcType;\r
+ typedef typename TypeVec<R, 2>::vec_t DstType;\r
+\r
+ __shared__ R smem[nthreads * 2];\r
+\r
+ const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;\r
+ const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ const int bid = blockIdx.y * gridDim.x + blockIdx.x;\r
+\r
+ SrcType val;\r
+ DstType sum = VecTraits<DstType>::all(0);\r
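+ // Each thread accumulates up to ctwidth x ctheight pixel pairs, strided by the block size.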
+ for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)\r
+ {\r
+ const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);\r
+ for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
+ {\r
+ val = ptr[x0 + x * blockDim.x];\r
+ sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y));\r
+ }\r
+ }\r
+\r
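+ // Stage the per-thread partials: channel x in smem[0, nthreads), channel y in smem[nthreads, 2 * nthreads).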
+ smem[tid] = sum.x;\r
+ smem[tid + nthreads] = sum.y;\r
+ __syncthreads();\r
+\r
+ sum_in_smem<nthreads, R>(smem, tid);\r
+ sum_in_smem<nthreads, R>(smem + nthreads, tid);\r
+\r
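+ // On devices with global atomics (compute capability >= 1.1) the reduction finishes here:
+ // every block publishes its partial, and the last block to take a ticket combines them.
+ // Older devices only write per-block partials; sum_pass2_kernel_C2 finishes the job.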
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+ __shared__ bool is_last;\r
+\r
+ if (tid == 0)\r
+ {\r
+ DstType res;\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ result[bid] = res;\r
+ __threadfence();\r
+\r
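+ // The fence above makes this block's partial globally visible before the ticket is taken.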
+ unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
+ is_last = (ticket == gridDim.x * gridDim.y - 1);\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ if (is_last)\r
+ {\r
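+ // The last block loads every per-block partial (one per thread) and reduces them to the final sum.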
+ DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);\r
+ smem[tid] = res.x;\r
+ smem[tid + nthreads] = res.y;\r
+ __syncthreads();\r
+\r
+ sum_in_smem<nthreads, R>(smem, tid);\r
+ sum_in_smem<nthreads, R>(smem + nthreads, tid);\r
+\r
+ if (tid == 0) \r
+ {\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ result[0] = res;\r
+ blocks_finished = 0;\r
+ }\r
+ }\r
+#else\r
+ if (tid == 0) \r
+ {\r
+ DstType res;\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ result[bid] = res;\r
+ }\r
+#endif\r
+ }\r
+\r
+\r
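+ // Second pass for devices without global atomics: launched as a single block,
+ // with size giving the number of per-block partials written by the first pass.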
+ template <typename T, typename R, int nthreads>\r
+ __global__ void sum_pass2_kernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)\r
+ {\r
+ typedef typename TypeVec<R, 2>::vec_t DstType;\r
+\r
+ __shared__ R smem[nthreads * 2];\r
+\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
+ DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
+ smem[tid] = res.x;\r
+ smem[tid + nthreads] = res.y;\r
+ __syncthreads();\r
+\r
+ sum_in_smem<nthreads, R>(smem, tid);\r
+ sum_in_smem<nthreads, R>(smem + nthreads, tid);\r
+\r
+ if (tid == 0) \r
+ {\r
+ res.x = smem[0];\r
+ res.y = smem[nthreads];\r
+ result[0] = res;\r
+ }\r
+ }\r
+\r
} // namespace sum\r
\r
\r
template <typename T>\r
- void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)\r
+ void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)\r
{\r
using namespace sum;\r
typedef typename SumType<T>::R R;\r
estimate_thread_cfg(src.cols, src.rows, threads, grid);\r
set_kernel_consts(src.cols, src.rows, threads, grid);\r
\r
- R* buf_ = (R*)buf.ptr(0);\r
-\r
- sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);\r
- sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(buf_, grid.x * grid.y);\r
+ switch (cn)
+ {
+ case 1:
+ sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
+ src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+ sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
+ (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
+ break;
+ case 2:
+ sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
+ src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+ sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
+ (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
+ break;
+ }
cudaSafeCall(cudaThreadSynchronize());\r
\r
- R result = 0;\r
- cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));\r
- sum[0] = result;\r
+ R result[4] = {0, 0, 0, 0};\r
+ cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
+\r
+ sum[0] = result[0];\r
+ sum[1] = result[1];\r
+ sum[2] = result[2];\r
+ sum[3] = result[3];\r
} \r
\r
- template void sum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);\r
- template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*);\r
- template void sum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);\r
- template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*);\r
- template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*);\r
- template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*);\r
+ template void sum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*, int);\r
+ template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);\r
+ template void sum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*, int);\r
+ template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);\r
+ template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);\r
+ template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);\r
\r
\r
template <typename T>\r
- void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)\r
+ void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)\r
{\r
using namespace sum;\r
typedef typename SumType<T>::R R;\r
estimate_thread_cfg(src.cols, src.rows, threads, grid);\r
set_kernel_consts(src.cols, src.rows, threads, grid);\r
\r
- R* buf_ = (R*)buf.ptr(0);\r
-\r
- sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);\r
- sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(buf_, grid.x * grid.y);\r
+ switch (cn)\r
+ {\r
+ case 1:\r
+ sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));\r
+ break;\r
+ case 2:\r
+ sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));\r
+ break;\r
+ }\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
- R result = 0;\r
- cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));\r
- sum[0] = result;\r
+ R result[4] = {0, 0, 0, 0};\r
+ cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
+\r
+ sum[0] = result[0];\r
+ sum[1] = result[1];\r
+ sum[2] = result[2];\r
+ sum[3] = result[3];\r
} \r
\r
- template void sqsum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*);\r
+ template void sum_caller<unsigned char>(const DevMem2D, PtrStep, double*, int);\r
+ template void sum_caller<char>(const DevMem2D, PtrStep, double*, int);\r
+ template void sum_caller<unsigned short>(const DevMem2D, PtrStep, double*, int);\r
+ template void sum_caller<short>(const DevMem2D, PtrStep, double*, int);\r
+ template void sum_caller<int>(const DevMem2D, PtrStep, double*, int);\r
+ template void sum_caller<float>(const DevMem2D, PtrStep, double*, int);\r
\r
\r
template <typename T>\r
- void sum_caller(const DevMem2D src, PtrStep buf, double* sum)\r
+ void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)\r
{\r
using namespace sum;\r
typedef typename SumType<T>::R R;\r
estimate_thread_cfg(src.cols, src.rows, threads, grid);\r
set_kernel_consts(src.cols, src.rows, threads, grid);\r
\r
- R* buf_ = (R*)buf.ptr(0);\r
-\r
- sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);\r
+ sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));\r
+ sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
R result = 0;\r
- cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));\r
+ cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R), cudaMemcpyDeviceToHost));\r
sum[0] = result;\r
} \r
\r
- template void sum_caller<unsigned char>(const DevMem2D, PtrStep, double*);\r
- template void sum_caller<char>(const DevMem2D, PtrStep, double*);\r
- template void sum_caller<unsigned short>(const DevMem2D, PtrStep, double*);\r
- template void sum_caller<short>(const DevMem2D, PtrStep, double*);\r
- template void sum_caller<int>(const DevMem2D, PtrStep, double*);\r
- template void sum_caller<float>(const DevMem2D, PtrStep, double*);\r
- template void sum_caller<double>(const DevMem2D, PtrStep, double*);\r
+ template void sqsum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);\r
+ template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*);\r
+ template void sqsum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);\r
+ template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*);\r
+ template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*);\r
+ template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*);\r
\r
\r
template <typename T>\r
estimate_thread_cfg(src.cols, src.rows, threads, grid);\r
set_kernel_consts(src.cols, src.rows, threads, grid);\r
\r
- R* buf_ = (R*)buf.ptr(0);\r
-\r
- sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);\r
+ sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
R result = 0;\r
- cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));\r
+ cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R), cudaMemcpyDeviceToHost));\r
sum[0] = result;\r
} \r
\r
template void sqsum_caller<short>(const DevMem2D, PtrStep, double*);\r
template void sqsum_caller<int>(const DevMem2D, PtrStep, double*);\r
template void sqsum_caller<float>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_caller<double>(const DevMem2D, PtrStep, double*);\r
}}}\r
\r
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);\r
}\r
\r
+\r
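+ // Element-wise arithmetic for the built-in unsigned int vector types, mirroring the existing float overloads.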
+ static __device__ uint1 operator+(const uint1& a, const uint1& b)\r
+ {\r
+ return make_uint1(a.x + b.x);\r
+ }\r
+ static __device__ uint1 operator-(const uint1& a, const uint1& b)\r
+ {\r
+ return make_uint1(a.x - b.x);\r
+ }\r
+ static __device__ uint1 operator*(const uint1& a, const uint1& b)\r
+ {\r
+ return make_uint1(a.x * b.x);\r
+ }\r
+ static __device__ uint1 operator/(const uint1& a, const uint1& b)\r
+ {\r
+ return make_uint1(a.x / b.x);\r
+ }\r
+ static __device__ float1 operator*(const uint1& a, float s)\r
+ {\r
+ return make_float1(a.x * s);\r
+ }\r
+\r
+ static __device__ uint2 operator+(const uint2& a, const uint2& b)\r
+ {\r
+ return make_uint2(a.x + b.x, a.y + b.y);\r
+ }\r
+ static __device__ uint2 operator-(const uint2& a, const uint2& b)\r
+ {\r
+ return make_uint2(a.x - b.x, a.y - b.y);\r
+ }\r
+ static __device__ uint2 operator*(const uint2& a, const uint2& b)\r
+ {\r
+ return make_uint2(a.x * b.x, a.y * b.y);\r
+ }\r
+ static __device__ uint2 operator/(const uint2& a, const uint2& b)\r
+ {\r
+ return make_uint2(a.x / b.x, a.y / b.y);\r
+ }\r
+ static __device__ float2 operator*(const uint2& a, float s)\r
+ {\r
+ return make_float2(a.x * s, a.y * s);\r
+ }\r
+\r
+ static __device__ uint3 operator+(const uint3& a, const uint3& b)\r
+ {\r
+ return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);\r
+ }\r
+ static __device__ uint3 operator-(const uint3& a, const uint3& b)\r
+ {\r
+ return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);\r
+ }\r
+ static __device__ uint3 operator*(const uint3& a, const uint3& b)\r
+ {\r
+ return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);\r
+ }\r
+ static __device__ uint3 operator/(const uint3& a, const uint3& b)\r
+ {\r
+ return make_uint3(a.x / b.x, a.y / b.y, a.z / b.z);\r
+ }\r
+ static __device__ float3 operator*(const uint3& a, float s)\r
+ {\r
+ return make_float3(a.x * s, a.y * s, a.z * s);\r
+ }\r
+\r
+ static __device__ uint4 operator+(const uint4& a, const uint4& b)\r
+ {\r
+ return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);\r
+ }\r
+ static __device__ uint4 operator-(const uint4& a, const uint4& b)\r
+ {\r
+ return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);\r
+ }\r
+ static __device__ uint4 operator*(const uint4& a, const uint4& b)\r
+ {\r
+ return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);\r
+ }\r
+ static __device__ uint4 operator/(const uint4& a, const uint4& b)\r
+ {\r
+ return make_uint4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);\r
+ }\r
+ static __device__ float4 operator*(const uint4& a, float s)\r
+ {\r
+ return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);\r
+ }\r
+\r
static __device__ float1 operator+(const float1& a, const float1& b)\r
{\r
return make_float1(a.x + b.x);\r