void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);

// Squared-sum reduction callers. "cn" is the channel count (1-4);
// "sum" must point to storage for at least 4 doubles (unused channels
// are written as 0 by the callers).
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);

// Two-pass variant for devices without global-memory atomics.
template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
\r
namespace sum\r
{\r
// Computes the per-channel sum of squared elements of "src" on the GPU.
// "buf" is a reusable scratch buffer (re-created here if too small).
// Supports 1-4 channel images of depth 8U/8S/16U/16S/32S/32F.
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) 
{
    using namespace mathfunc;

    // One caller per depth. Row 0: multi-pass fallback for devices
    // without global atomics; row 1: single-pass version for devices
    // with atomics. BUG FIX: the table previously initialized only one
    // (mixed) row, leaving row 1 zero-filled, so every atomics-capable
    // device hit the "unsupported type" error below.
    typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
    static const Caller callers[2][7] = 
        { { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>, 
            sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>, 
            sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },
          { sqsum_caller<unsigned char>, sqsum_caller<char>, 
            sqsum_caller<unsigned short>, sqsum_caller<short>, 
            sqsum_caller<int>, sqsum_caller<float>, 0 } };

    // Scratch buffer must hold the per-block partial sums of every channel.
    Size bufSize;
    sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); 
    buf.create(bufSize, CV_8U);

    // Index by depth (not full type) so multi-channel images dispatch too;
    // unsupported depths (e.g. 64F, slot 6) map to the null entry.
    Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
    if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");

    double result[4];
    caller(src, buf, result, src.channels());
    return Scalar(result[0], result[1], result[2], result[3]);
}
\r
////////////////////////////////////////////////////////////////////////\r
\r
\r
template <typename T>\r
- void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)\r
+ void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)\r
{\r
using namespace sum;\r
typedef typename SumType<T>::R R;\r
estimate_thread_cfg(src.cols, src.rows, threads, grid);\r
set_kernel_consts(src.cols, src.rows, threads, grid);\r
\r
- sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));\r
- sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
- (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ switch (cn)\r
+ {\r
+ case 1:\r
+ sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));\r
+ sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ break;\r
+ case 2:\r
+ sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));\r
+ sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ break;\r
+ case 3:\r
+ sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));\r
+ sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ break;\r
+ case 4:\r
+ sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));\r
+ sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
+ (typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);\r
+ break;\r
+ }\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
- R result = 0;\r
- cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R), cudaMemcpyDeviceToHost));\r
- sum[0] = result;\r
+ R result[4] = {0, 0, 0, 0};\r
+ cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
+\r
+ sum[0] = result[0];\r
+ sum[1] = result[1];\r
+ sum[2] = result[2];\r
+ sum[3] = result[3];\r
} \r
\r
- template void sqsum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*);\r
+ template void sqsum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*, int);\r
+ template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);\r
+ template void sqsum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*, int);\r
+ template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);\r
+ template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);\r
+ template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);\r
\r
\r
template <typename T>\r
- void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum)\r
+ void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)\r
{\r
using namespace sum;\r
typedef typename SumType<T>::R R;\r
estimate_thread_cfg(src.cols, src.rows, threads, grid);\r
set_kernel_consts(src.cols, src.rows, threads, grid);\r
\r
- sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
- src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));\r
+ switch (cn)\r
+ {\r
+ case 1:\r
+ sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));\r
+ break;\r
+ case 2:\r
+ sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));\r
+ break;\r
+ case 3:\r
+ sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));\r
+ break;\r
+ case 4:\r
+ sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(\r
+ src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));\r
+ break;\r
+ }\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
- R result = 0;\r
- cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R), cudaMemcpyDeviceToHost));\r
- sum[0] = result;\r
- } \r
+ R result[4] = {0, 0, 0, 0};\r
+ cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));\r
\r
- template void sqsum_caller<unsigned char>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_caller<char>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_caller<unsigned short>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_caller<short>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_caller<int>(const DevMem2D, PtrStep, double*);\r
- template void sqsum_caller<float>(const DevMem2D, PtrStep, double*);\r
+ sum[0] = result[0];\r
+ sum[1] = result[1];\r
+ sum[2] = result[2];\r
+ sum[3] = result[3];\r
+ }\r
+\r
+ template void sqsum_caller<unsigned char>(const DevMem2D, PtrStep, double*, int);\r
+ template void sqsum_caller<char>(const DevMem2D, PtrStep, double*, int);\r
+ template void sqsum_caller<unsigned short>(const DevMem2D, PtrStep, double*, int);\r
+ template void sqsum_caller<short>(const DevMem2D, PtrStep, double*, int);\r
+ template void sqsum_caller<int>(const DevMem2D, PtrStep, double*, int);\r
+ template void sqsum_caller<float>(const DevMem2D, PtrStep, double*, int);\r
}}}\r
\r
+\r
}\r
if (type != CV_8S)\r
{\r
+ gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 1), src);\r
b = sqrSum(GpuMat(src));\r
Mat sqrsrc;\r
multiply(src, src, sqrsrc);\r
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);\r
return;\r
}\r
+ gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 2), src);\r
+ b = sqrSum(GpuMat(src));\r
+ multiply(src, src, sqrsrc);\r
+ a = sum(sqrsrc);\r
+ if (abs(a[0] - b[0]) + abs(a[1] - b[1])> src.size().area() * max_err * 2)\r
+ {\r
+ ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]);\r
+ ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);\r
+ return;\r
+ }\r
+ gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 3), src);\r
+ b = sqrSum(GpuMat(src));\r
+ multiply(src, src, sqrsrc);\r
+ a = sum(sqrsrc);\r
+ if (abs(a[0] - b[0]) + abs(a[1] - b[1]) + abs(a[2] - b[2])> src.size().area() * max_err * 3)\r
+ {\r
+ ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]);\r
+ ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);\r
+ return;\r
+ }\r
+ gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 4), src);\r
+ b = sqrSum(GpuMat(src));\r
+ multiply(src, src, sqrsrc);\r
+ a = sum(sqrsrc);\r
+ if (abs(a[0] - b[0]) + abs(a[1] - b[1]) + abs(a[2] - b[2]) + abs(a[3] - b[3])> src.size().area() * max_err * 4)\r
+ {\r
+ ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]);\r
+ ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);\r
+ return;\r
+ }\r
}\r
}\r
}\r