reintegrate warp shuffle based integral

author marina.kolpakova <marina.kolpakova@itseez.com>

Sat, 24 Nov 2012 23:21:51 +0000 (03:21 +0400)

committer marina.kolpakova <marina.kolpakova@itseez.com>

Sat, 24 Nov 2012 23:21:51 +0000 (03:21 +0400)
author marina.kolpakova <marina.kolpakova@itseez.com>
Sat, 24 Nov 2012 23:21:51 +0000 (03:21 +0400)
committer marina.kolpakova <marina.kolpakova@itseez.com>
Sat, 24 Nov 2012 23:21:51 +0000 (03:21 +0400)
diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu

index a34a52a..09187fd 100644 (file)
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@@ -361,14 +361,8 @@ namespace cv { namespace gpu { namespace device
          {
              {
                  // each thread handles 16 values, use 1 block/row
-                int block = img.cols / 16;
-
                  // safe, because the step actually can't be less than 512 bytes
-                int align = img.cols % 4;
-                if ( align != 0)
-                {
-                    block += (4 - align);
-                }
+                int block = integral.cols / 16;
  
                  // launch 1 block / row
                  const int grid = img.rows;
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp

index 81a2248..309b14a 100644 (file)
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -553,44 +553,25 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
  
      src.locateROI(whole, offset);
  
-    if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048)
+    if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048 && offset.x % 16 == 0 && (src.cols + 63) / 64 <= (src.step - offset.x))
      {
-        GpuMat srcAlligned;
+        ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
  
-        if (src.cols % 16 == 0 && src.rows % 8 == 0 && offset.x % 16 == 0 && offset.y % 8 == 0)
-            srcAlligned = src;
-        else
-        {
-            ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 15) / 16) * 16, src.type(), buffer);
-
-            GpuMat inner = buffer(Rect(0, 0, src.cols, src.rows));
-
-            if (s)
-            {
-                s.enqueueMemSet(buffer, Scalar::all(0));
-                s.enqueueCopy(src, inner);
-            }
-            else
-            {
-                buffer.setTo(Scalar::all(0));
-                src.copyTo(inner);
-            }
-
-            srcAlligned = buffer;
-        }
-
-        sum.create(srcAlligned.rows + 1, srcAlligned.cols + 4, CV_32SC1);
+        cv::gpu::device::imgproc::shfl_integral_gpu(src, buffer, stream);
  
+        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
          if (s)
              s.enqueueMemSet(sum, Scalar::all(0));
          else
              sum.setTo(Scalar::all(0));
  
-        GpuMat inner = sum(Rect(4, 1, srcAlligned.cols, srcAlligned.rows));
-
-        cv::gpu::device::imgproc::shfl_integral_gpu(srcAlligned, inner, stream);
+        GpuMat inner = sum(Rect(1, 1, src.cols, src.rows));
+        GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
  
-            sum = sum(Rect(3, 0, src.cols + 1, src.rows + 1));
+        if (s)
+            s.enqueueCopy(res, inner);
+        else
+            res.copyTo(inner);
      }
      else
      {
author	marina.kolpakova <marina.kolpakova@itseez.com>
	Sat, 24 Nov 2012 23:21:51 +0000 (03:21 +0400)
committer	marina.kolpakova <marina.kolpakova@itseez.com>
	Sat, 24 Nov 2012 23:21:51 +0000 (03:21 +0400)
modules/gpu/src/cuda/integral_image.cu		patch \| blob \| history
modules/gpu/src/imgproc.cpp		patch \| blob \| history