added gpu::sqrSum function

author Alexey Spizhevoy <no@email>

Mon, 13 Dec 2010 14:34:02 +0000 (14:34 +0000)

committer Alexey Spizhevoy <no@email>

Mon, 13 Dec 2010 14:34:02 +0000 (14:34 +0000)
author Alexey Spizhevoy <no@email>
Mon, 13 Dec 2010 14:34:02 +0000 (14:34 +0000)
committer Alexey Spizhevoy <no@email>
Mon, 13 Dec 2010 14:34:02 +0000 (14:34 +0000)
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp

index f0d4dd3..26f9d64 100644 (file)
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -428,6 +428,14 @@ namespace cv
          //! supports only single channel images\r
          CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);\r
  \r
+        //! computes the sum of squared array elements (sum of x*x over src)\r
+        //! supports only single channel images\r
+        CV_EXPORTS Scalar sqrSum(const GpuMat& src);\r
+\r
+        //! computes the sum of squared array elements (sum of x*x over src); buf is sized via create() and used as the work buffer\r
+        //! supports only single channel images\r
+        CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);\r
+\r
          //! finds global minimum and maximum array elements and returns their values\r
          CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());\r
  \r
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp

index 049bfa4..5f7dd61 100644 (file)
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -66,6 +66,8 @@ double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return
  void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); }\r
  Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }\r
  Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }\r
+Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); } /* stub: calls throw_nogpu() and computes nothing; the return presumably only satisfies the signature */\r
+Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); } /* same stub for the overload that takes a work buffer */\r
  void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }\r
  void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); }\r
  void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); }\r
@@ -489,6 +491,12 @@ namespace cv { namespace gpu { namespace mathfunc
      template <typename T>\r
      void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum);\r
  \r
+    template <typename T> /* sum of squares, single-kernel-launch variant; defined in cuda/mathfunc.cu, result is written to sum[0] */\r
+    void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum);\r
+\r
+    template <typename T> /* sum of squares, two-kernel-launch variant; defined in cuda/mathfunc.cu, result is written to sum[0] */\r
+    void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum);\r
+\r
      namespace sum\r
      {\r
          void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);\r
@@ -527,6 +535,38 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
      return result;\r
  }\r
  \r
+Scalar cv::gpu::sqrSum(const GpuMat& src) /* convenience overload: sum of squared elements using a function-local work buffer */\r
+{\r
+    GpuMat buf; /* left empty here; the two-argument overload sizes it with create() */\r
+    return sqrSum(src, buf);\r
+}\r
+\r
+Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) \r
+{\r
+    using namespace mathfunc;\r
+    CV_Assert(src.channels() == 1);\r
+\r
+    typedef void (*Caller)(const DevMem2D, PtrStep, double*);\r
+    static const Caller callers[2][7] = \r
+        { { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>, \r
+            sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>, \r
+            sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },\r
+          { sqsum_caller<unsigned char>, sqsum_caller<char>, \r
+            sqsum_caller<unsigned short>, sqsum_caller<short>, \r
+            sqsum_caller<int>, sqsum_caller<float>, sqsum_caller<double> } };\r
+\r
+    Size bufSize;\r
+    sum::get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height); \r
+    buf.create(bufSize, CV_8U);\r
+\r
+    Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];\r
+    if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");\r
+\r
+    double result;\r
+    caller(src, buf, &result);\r
+    return result;\r
+}\r
+\r
  ////////////////////////////////////////////////////////////////////////\r
  // minMax\r
  \r
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu

index 3d6e1f6..c990228 100644 (file)
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -1428,6 +1428,12 @@ namespace cv { namespace gpu { namespace mathfunc
      template <> struct SumType<float> { typedef float R; };\r
      template <> struct SumType<double> { typedef double R; };\r
  \r
+    template <typename R> /* element operation for sum_kernel's Op parameter: returns x unchanged, which gives a plain sum */\r
+    struct IdentityOp { static __device__ R call(R x) { return x; } };\r
+\r
+    template <typename R> /* element operation for sum_kernel's Op parameter: returns x * x, which gives a sum of squares. NOTE(review): the product is evaluated in R, so large integer inputs may overflow - confirm against the SumType widths */\r
+    struct SqrOp { static __device__ R call(R x) { return x * x; } };\r
+\r
      __constant__ int ctwidth;\r
      __constant__ int ctheight;\r
      __device__ unsigned int blocks_finished = 0;\r
@@ -1462,7 +1468,7 @@ namespace cv { namespace gpu { namespace mathfunc
          cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); \r
      }\r
  \r
-    template <typename T, typename R, int nthreads>\r
+    template <typename T, typename R, typename Op, int nthreads>\r
      __global__ void sum_kernel(const DevMem2D_<T> src, R* result)\r
      {\r
          __shared__ R smem[nthreads];\r
@@ -1477,7 +1483,7 @@ namespace cv { namespace gpu { namespace mathfunc
          {\r
              const T* ptr = src.ptr(y0 + y * blockDim.y);\r
              for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)\r
-                sum += ptr[x0 + x * blockDim.x];\r
+                sum += Op::call(ptr[x0 + x * blockDim.x]);\r
          }\r
  \r
          smem[tid] = sum;\r
@@ -1548,9 +1554,8 @@ namespace cv { namespace gpu { namespace mathfunc
  \r
          R* buf_ = (R*)buf.ptr(0);\r
  \r
-        sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);\r
-        sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(\r
-                buf_, grid.x * grid.y);\r
+        sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);\r
+        sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(buf_, grid.x * grid.y);\r
          cudaSafeCall(cudaThreadSynchronize());\r
  \r
          R result = 0;\r
@@ -1567,6 +1572,35 @@ namespace cv { namespace gpu { namespace mathfunc
  \r
  \r
      template <typename T>\r
+    void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum) /* sum of squared elements of src using two kernel launches; the result is stored in sum[0] */\r
+    {\r
+        using namespace sum;\r
+        typedef typename SumType<T>::R R; /* accumulator type that SumType selects for element type T */\r
+\r
+        dim3 threads, grid;\r
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);\r
+        set_kernel_consts(src.cols, src.rows, threads, grid);\r
+\r
+        R* buf_ = (R*)buf.ptr(0); /* the work buffer is addressed as an array of R */\r
+\r
+        sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_); /* SqrOp squares each element before it is accumulated */\r
+        sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(buf_, grid.x * grid.y); /* one block over grid.x * grid.y buffer entries; presumably folds the per-block results into buf_[0] - confirm against sum_pass2_kernel */\r
+        cudaSafeCall(cudaThreadSynchronize());\r
+\r
+        R result = 0;\r
+        cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost)); /* copy back the first R stored in the buffer */\r
+        sum[0] = result;\r
+    }  \r
+\r
+    template void sqsum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*); /* explicit instantiations for the types referenced by the dispatch table in arithm.cpp; double is not instantiated for this variant (the table holds 0 there) */\r
+    template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*);\r
+    template void sqsum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);\r
+    template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*);\r
+    template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*);\r
+    template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*);\r
+\r
+\r
+    template <typename T>\r
      void sum_caller(const DevMem2D src, PtrStep buf, double* sum)\r
      {\r
          using namespace sum;\r
@@ -1578,7 +1612,7 @@ namespace cv { namespace gpu { namespace mathfunc
  \r
          R* buf_ = (R*)buf.ptr(0);\r
  \r
-        sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);\r
+        sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);\r
          cudaSafeCall(cudaThreadSynchronize());\r
  \r
          R result = 0;\r
@@ -1593,5 +1627,34 @@ namespace cv { namespace gpu { namespace mathfunc
      template void sum_caller<int>(const DevMem2D, PtrStep, double*);\r
      template void sum_caller<float>(const DevMem2D, PtrStep, double*);\r
      template void sum_caller<double>(const DevMem2D, PtrStep, double*);\r
+\r
+\r
+    template <typename T> /* sum of squared elements of src using a single kernel launch; the result is stored in sum[0] */\r
+    void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum)\r
+    {\r
+        using namespace sum;\r
+        typedef typename SumType<T>::R R; /* accumulator type that SumType selects for element type T */\r
+\r
+        dim3 threads, grid;\r
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);\r
+        set_kernel_consts(src.cols, src.rows, threads, grid);\r
+\r
+        R* buf_ = (R*)buf.ptr(0); /* the work buffer is addressed as an array of R */\r
+\r
+        sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_); /* SqrOp squares each element before it is accumulated; no second pass is launched, so sum_kernel presumably completes the reduction itself - confirm against sum_kernel */\r
+        cudaSafeCall(cudaThreadSynchronize());\r
+\r
+        R result = 0;\r
+        cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost)); /* copy back the first R stored in the buffer */\r
+        sum[0] = result;\r
+    }  \r
+\r
+    template void sqsum_caller<unsigned char>(const DevMem2D, PtrStep, double*); /* explicit instantiations for every type referenced by the dispatch table in arithm.cpp, double included */\r
+    template void sqsum_caller<char>(const DevMem2D, PtrStep, double*);\r
+    template void sqsum_caller<unsigned short>(const DevMem2D, PtrStep, double*);\r
+    template void sqsum_caller<short>(const DevMem2D, PtrStep, double*);\r
+    template void sqsum_caller<int>(const DevMem2D, PtrStep, double*);\r
+    template void sqsum_caller<float>(const DevMem2D, PtrStep, double*);\r
+    template void sqsum_caller<double>(const DevMem2D, PtrStep, double*);\r
  }}}\r
  \r
diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp

index 521120c..abe76a1 100644 (file)
--- a/tests/gpu/src/arithm.cpp
+++ b/tests/gpu/src/arithm.cpp
@@ -940,7 +940,7 @@ struct CV_GpuSumTest: CvTest
          {\r
              Mat src;\r
              Scalar a, b;\r
-            double max_err = 1e-6;\r
+            double max_err = 1e-5;\r
  \r
              int typemax = hasNativeDoubleSupport(getDevice()) ? CV_64F : CV_32F;\r
              for (int type = CV_8U; type <= typemax; ++type) \r
@@ -954,6 +954,19 @@ struct CV_GpuSumTest: CvTest
                      ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);\r
                      return;\r
                  }\r
+                if (type != CV_8S)\r
+                {\r
+                    b = sqrSum(GpuMat(src));\r
+                    Mat sqrsrc;\r
+                    multiply(src, src, sqrsrc);\r
+                    a = sum(sqrsrc);\r
+                    if (abs(a[0] - b[0]) > src.size().area() * max_err)\r
+                    {\r
+                        ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]);\r
+                        ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);\r
+                        return;\r
+                    }\r
+                }\r
              }\r
          }\r
          catch (const Exception& e)\r
@@ -967,7 +980,7 @@ struct CV_GpuSumTest: CvTest
      {\r
          m.create(rows, cols, type);\r
          RNG rng;\r
-        rng.fill(m, RNG::UNIFORM, Scalar::all(0), Scalar::all(20));\r
+        rng.fill(m, RNG::UNIFORM, Scalar::all(0), Scalar::all(16));\r
  \r
      }\r
  };\r
author	Alexey Spizhevoy <no@email>
	Mon, 13 Dec 2010 14:34:02 +0000 (14:34 +0000)
committer	Alexey Spizhevoy <no@email>
	Mon, 13 Dec 2010 14:34:02 +0000 (14:34 +0000)
modules/gpu/include/opencv2/gpu/gpu.hpp		patch \| blob \| history
modules/gpu/src/arithm.cpp		patch \| blob \| history
modules/gpu/src/cuda/mathfunc.cu		patch \| blob \| history
tests/gpu/src/arithm.cpp		patch \| blob \| history