struct IdentityOp { static __device__ R call(R x) { return x; } };\r
\r
template <typename R> \r
+ struct AbsOp { static __device__ R call(R x) { return abs(x); } };\r
+\r
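+ // Unsigned accumulators are already non-negative, so the uint
+ // specialization of AbsOp is a plain identity.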
+ template <>\r
+ struct AbsOp<uint> { static __device__ uint call(uint x) { return x; } };\r
+\r
+ template <typename R> \r
struct SqrOp { static __device__ R call(R x) { return x * x; } };\r
\r
__constant__ int ctwidth;\r
\r
\r
template <typename T>\r
+ void absSumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn)\r
+ {\r
+ using namespace sums;\r
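+ // R is the accumulator type that SumType selects for T.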
+ typedef typename SumType<T>::R R;\r
+\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
+\r
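+ // Two-pass reduction: sumKernel writes one partial sum of |x| per block
+ // into buf, then sumPass2Kernel folds the grid.x * grid.y partial sums
+ // into the final per-channel result.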
+ switch (cn)\r
+ {\r
+ case 1:\r
+ sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));\r
+ sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ break;\r
+ case 2:\r
+ sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));\r
+ sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ break;\r
+ case 3:\r
+ sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));\r
+ sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ break;\r
+ case 4:\r
+ sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));\r
+ sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ break;\r
+ }\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+\r
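+ // Only the first cn entries are copied back; the remaining entries of
+ // result stay zero.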
+ R result[4] = {0, 0, 0, 0};\r
+ cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
+\r
+ sum[0] = result[0];\r
+ sum[1] = result[1];\r
+ sum[2] = result[2];\r
+ sum[3] = result[3];\r
+ } \r
+\r
+ template void absSumMultipassCaller<uchar>(const DevMem2D, PtrStep, double*, int);\r
+ template void absSumMultipassCaller<char>(const DevMem2D, PtrStep, double*, int);\r
+ template void absSumMultipassCaller<ushort>(const DevMem2D, PtrStep, double*, int);\r
+ template void absSumMultipassCaller<short>(const DevMem2D, PtrStep, double*, int);\r
+ template void absSumMultipassCaller<int>(const DevMem2D, PtrStep, double*, int);\r
+ template void absSumMultipassCaller<float>(const DevMem2D, PtrStep, double*, int);\r
+\r
+\r
+ template <typename T>\r
+ void absSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn)\r
+ {\r
+ using namespace sums;\r
+ typedef typename SumType<T>::R R;\r
+\r
+ dim3 threads, grid;\r
+ estimateThreadCfg(src.cols, src.rows, threads, grid);\r
+ setKernelConsts(src.cols, src.rows, threads, grid);\r
+\r
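+ // Single-pass variant: the kernel produces the final per-channel sum
+ // directly, so no second reduction pass is launched (the host-side
+ // absSum() dispatches here only when atomics are available).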
+ switch (cn)\r
+ {\r
+ case 1:\r
+ sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));\r
+ break;\r
+ case 2:\r
+ sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));\r
+ break;\r
+ case 3:\r
+ sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));\r
+ break;\r
+ case 4:\r
+ sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));\r
+ break;\r
+ }\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+\r
+ R result[4] = {0, 0, 0, 0};\r
+ cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
+\r
+ sum[0] = result[0];\r
+ sum[1] = result[1];\r
+ sum[2] = result[2];\r
+ sum[3] = result[3];\r
+ }\r
+\r
+ template void absSumCaller<uchar>(const DevMem2D, PtrStep, double*, int);\r
+ template void absSumCaller<char>(const DevMem2D, PtrStep, double*, int);\r
+ template void absSumCaller<ushort>(const DevMem2D, PtrStep, double*, int);\r
+ template void absSumCaller<short>(const DevMem2D, PtrStep, double*, int);\r
+ template void absSumCaller<int>(const DevMem2D, PtrStep, double*, int);\r
+ template void absSumCaller<float>(const DevMem2D, PtrStep, double*, int);\r
+\r
+\r
+ template <typename T>\r
void sqrSumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn)\r
{\r
using namespace sums;\r
double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }\r
Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }\r
Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }\r
+Scalar cv::gpu::absSum(const GpuMat&) { throw_nogpu(); return Scalar(); }\r
+Scalar cv::gpu::absSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }\r
Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); }\r
Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }\r
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }\r
void sumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);\r
\r
template <typename T>\r
+ void absSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);\r
+\r
+ template <typename T>\r
+ void absSumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);\r
+\r
+ template <typename T>\r
void sqrSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);\r
\r
template <typename T>\r
\r
Size buf_size;\r
sums::getBufSizeRequired(src.cols, src.rows, src.channels(), \r
- buf_size.width, buf_size.height); \r
+ buf_size.width, buf_size.height); \r
ensureSizeIsEnough(buf_size, CV_8U, buf);\r
\r
Caller* callers = multipass_callers;\r
}\r
\r
\r
+Scalar cv::gpu::absSum(const GpuMat& src) \r
+{\r
+ GpuMat buf;\r
+ return absSum(src, buf);\r
+}\r
+\r
+\r
+Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf) \r
+{\r
+ using namespace mathfunc;\r
+\r
+ typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);\r
+\r
+ static Caller multipass_callers[7] = { \r
+ absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>, \r
+ absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>, \r
+ absSumMultipassCaller<int>, absSumMultipassCaller<float>, 0 };\r
+\r
+ static Caller singlepass_callers[7] = { \r
+ absSumCaller<unsigned char>, absSumCaller<char>, \r
+ absSumCaller<unsigned short>, absSumCaller<short>, \r
+ absSumCaller<int>, absSumCaller<float>, 0 };\r
+\r
+ Size buf_size;\r
+ sums::getBufSizeRequired(src.cols, src.rows, src.channels(), \r
+ buf_size.width, buf_size.height); \r
+ ensureSizeIsEnough(buf_size, CV_8U, buf);\r
+\r
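+ // Prefer the single-pass reduction when the binary was built with
+ // atomics and the current device supports them; otherwise fall back
+ // to the two-pass callers.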
+ Caller* callers = multipass_callers;\r
+ if (TargetArchs::builtWith(ATOMICS) && DeviceInfo().has(ATOMICS))\r
+ callers = singlepass_callers;\r
+\r
+ Caller caller = callers[src.depth()];\r
+ if (!caller) CV_Error(CV_StsBadArg, "absSum: unsupported type");\r
+\r
+ double result[4];\r
+ caller(src, buf, result, src.channels());\r
+ return Scalar(result[0], result[1], result[2], result[3]);\r
+}\r
+\r
+\r
Scalar cv::gpu::sqrSum(const GpuMat& src) \r
{\r
GpuMat buf;\r
return Scalar(result[0], result[1], result[2], result[3]);\r
}\r
\r
+\r
+\r
+\r
////////////////////////////////////////////////////////////////////////\r
// Find min or max\r
\r
int typemax = CV_32F;\r
for (int type = CV_8U; type <= typemax; ++type) \r
{\r
+ //\r
+ // sum\r
+ //\r
+\r
gen(1 + rand() % 500, 1 + rand() % 500, CV_MAKETYPE(type, 2), src);\r
a = sum(src);\r
b = sum(GpuMat(src));\r
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);\r
return;\r
}\r
+\r
gen(1 + rand() % 500, 1 + rand() % 500, CV_MAKETYPE(type, 3), src);\r
a = sum(src);\r
b = sum(GpuMat(src));\r
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);\r
return;\r
}\r
+\r
gen(1 + rand() % 500, 1 + rand() % 500, CV_MAKETYPE(type, 4), src);\r
a = sum(src);\r
b = sum(GpuMat(src));\r
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);\r
return;\r
}\r
+\r
gen(1 + rand() % 500, 1 + rand() % 500, type, src);\r
a = sum(src);\r
b = sum(GpuMat(src));\r
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);\r
return;\r
}\r
+\r
+ //\r
+ // absSum\r
+ //\r
+\r
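+ // absSum of a single-channel matrix should match the CPU L1 norm
+ // within max_err per element.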
+ gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 1), src);\r
+ b = absSum(GpuMat(src));\r
+ a = norm(src, NORM_L1);\r
+ if (abs(a[0] - b[0]) > src.size().area() * max_err)\r
+ {\r
+ ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]);\r
+ ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);\r
+ return;\r
+ }\r
+\r
+ //\r
+ // sqrSum\r
+ //\r
+\r
if (type != CV_8S)\r
{\r
gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 1), src);\r