Bug #4315: fix CUDA bitwise operations with mask

author Vladislav Vinogradov <vlad.vinogradov@itseez.com>

Wed, 6 May 2015 10:28:08 +0000 (13:28 +0300)

committer Vladislav Vinogradov <vlad.vinogradov@itseez.com>

Wed, 6 May 2015 10:28:08 +0000 (13:28 +0300)
author Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Wed, 6 May 2015 10:28:08 +0000 (13:28 +0300)
committer Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Wed, 6 May 2015 10:28:08 +0000 (13:28 +0300)
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu

index 876d4ad..f606f0c 100644 (file)
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -1896,53 +1896,53 @@ namespace cv { namespace gpu { namespace device
  
  namespace arithm
  {
-    template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream)
      {
          if (mask.data)
-            transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), mask, stream);
+            transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), SingleMaskChannels(mask, num_channels), stream);
          else
              transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), WithOutMask(), stream);
      }
  
-    template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream)
      {
          if (mask.data)
-            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), mask, stream);
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), SingleMaskChannels(mask, num_channels), stream);
          else
              transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), WithOutMask(), stream);
      }
  
-    template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream)
      {
          if (mask.data)
-            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), mask, stream);
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), SingleMaskChannels(mask, num_channels), stream);
          else
              transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), WithOutMask(), stream);
      }
  
-    template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream)
      {
          if (mask.data)
-            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), mask, stream);
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), SingleMaskChannels(mask, num_channels), stream);
          else
              transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), WithOutMask(), stream);
      }
  
-    template void bitMatNot<uchar>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatNot<ushort>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatNot<uint>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void bitMatNot<uchar>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
+    template void bitMatNot<ushort>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
+    template void bitMatNot<uint>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
  
-    template void bitMatAnd<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatAnd<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatAnd<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void bitMatAnd<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
+    template void bitMatAnd<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
+    template void bitMatAnd<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
  
-    template void bitMatOr<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatOr<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatOr<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void bitMatOr<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
+    template void bitMatOr<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
+    template void bitMatOr<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
  
-    template void bitMatXor<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatXor<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatXor<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void bitMatXor<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
+    template void bitMatXor<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
+    template void bitMatXor<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
  }
  
  //////////////////////////////////////////////////////////////////////////////////////
diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp

index 356b50a..354d614 100644 (file)
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -1955,7 +1955,7 @@ void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stre
  
  namespace arithm
  {
-    template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
  }
  
  void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& s)
@@ -1964,39 +1964,73 @@ void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, St
  
      const int depth = src.depth();
  
-    CV_Assert( depth <= CV_64F );
+    CV_Assert( depth < CV_32F );
      CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
  
      dst.create(src.size(), src.type());
  
      cudaStream_t stream = StreamAccessor::getStream(s);
  
-    const int bcols = (int) (src.cols * src.elemSize());
-
-    if ((bcols & 3) == 0)
+    if (mask.empty())
      {
-        const int vcols = bcols >> 2;
+        const int bcols = (int) (src.cols * src.elemSize());
+        bool aligned =
+                isAligned(src.data, sizeof(unsigned int)) &&
+                isAligned(dst.data, sizeof(unsigned int));
  
-        bitMatNot<unsigned int>(
-                    PtrStepSzb(src.rows, vcols, src.data, src.step),
-                    PtrStepSzb(src.rows, vcols, dst.data, dst.step),
-                    mask, stream);
-    }
-    else if ((bcols & 1) == 0)
-    {
-        const int vcols = bcols >> 1;
+        if (aligned && (bcols & 3) == 0)
+        {
+            const int vcols = bcols >> 2;
  
-        bitMatNot<unsigned short>(
-                    PtrStepSzb(src.rows, vcols, src.data, src.step),
-                    PtrStepSzb(src.rows, vcols, dst.data, dst.step),
-                    mask, stream);
+            bitMatNot<unsigned int>(
+                        PtrStepSzb(src.rows, vcols, src.data, src.step),
+                        PtrStepSzb(src.rows, vcols, dst.data, dst.step),
+                        PtrStepb(), 1, stream);
+        }
+        else if (aligned && (bcols & 1) == 0)
+        {
+            const int vcols = bcols >> 1;
+
+            bitMatNot<unsigned short>(
+                        PtrStepSzb(src.rows, vcols, src.data, src.step),
+                        PtrStepSzb(src.rows, vcols, dst.data, dst.step),
+                        PtrStepb(), 1, stream);
+        }
+        else
+        {
+            bitMatNot<unsigned char>(
+                        PtrStepSzb(src.rows, bcols, src.data, src.step),
+                        PtrStepSzb(src.rows, bcols, dst.data, dst.step),
+                        PtrStepb(), 1, stream);
+        }
      }
      else
      {
-        bitMatNot<unsigned char>(
-                    PtrStepSzb(src.rows, bcols, src.data, src.step),
-                    PtrStepSzb(src.rows, bcols, dst.data, dst.step),
-                    mask, stream);
+        const int elem_size = src.elemSize1();
+        const int num_channels = src.channels();
+        const int bcols = src.cols * num_channels;
+
+        if (elem_size == 1)
+        {
+            bitMatNot<unsigned char>(
+                        PtrStepSzb(src.rows, bcols, src.data, src.step),
+                        PtrStepSzb(src.rows, bcols, dst.data, dst.step),
+                        mask, num_channels, stream);
+        }
+        else if (elem_size == 2)
+        {
+            bitMatNot<unsigned short>(
+                        PtrStepSzb(src.rows, bcols, src.data, src.step),
+                        PtrStepSzb(src.rows, bcols, dst.data, dst.step),
+                        mask, num_channels, stream);
+        }
+        else if (elem_size == 4)
+        {
+            bitMatNot<unsigned int>(
+                        PtrStepSzb(src.rows, bcols, src.data, src.step),
+                        PtrStepSzb(src.rows, bcols, dst.data, dst.step),
+                        mask, num_channels, stream);
+        }
      }
  }
  
@@ -2005,9 +2039,9 @@ void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, St
  
  namespace arithm
  {
-    template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
+    template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
+    template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, int num_channels, cudaStream_t stream);
  }
  
  void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& s)
@@ -2016,7 +2050,7 @@ void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
  
      const int depth = src1.depth();
  
-    CV_Assert( depth <= CV_64F );
+    CV_Assert( depth < CV_32F );
      CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
      CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) );
  
@@ -2024,36 +2058,73 @@ void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
  
      cudaStream_t stream = StreamAccessor::getStream(s);
  
-    const int bcols = (int) (src1.cols * src1.elemSize());
-
-    if ((bcols & 3) == 0)
+    if (mask.empty())
      {
-        const int vcols = bcols >> 2;
+        const int bcols = (int) (src1.cols * src1.elemSize());
+        bool aligned =
+                isAligned(src1.data, sizeof(unsigned int)) &&
+                isAligned(src2.data, sizeof(unsigned int)) &&
+                isAligned(dst.data, sizeof(unsigned int));
  
-        bitMatAnd<unsigned int>(
-                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
-                    mask, stream);
-    }
-    else if ((bcols & 1) == 0)
-    {
-        const int vcols = bcols >> 1;
+        if (aligned && (bcols & 3) == 0)
+        {
+            const int vcols = bcols >> 2;
  
-        bitMatAnd<unsigned short>(
-                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
-                    mask, stream);
+            bitMatAnd<unsigned int>(
+                        PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+                        PtrStepb(), 1, stream);
+        }
+        else if (aligned && (bcols & 1) == 0)
+        {
+            const int vcols = bcols >> 1;
+
+            bitMatAnd<unsigned short>(
+                        PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+                        PtrStepb(), 1, stream);
+        }
+        else
+        {
+            bitMatAnd<unsigned char>(
+                        PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                        PtrStepb(), 1, stream);
+        }
      }
      else
      {
+        const int elem_size = src1.elemSize1();
+        const int num_channels = src1.channels();
+        const int bcols = src1.cols * num_channels;
  
-        bitMatAnd<unsigned char>(
-                    PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
-                    mask, stream);
+        if (elem_size == 1)
+        {
+            bitMatAnd<unsigned char>(
+                        PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                        mask, num_channels, stream);
+        }
+        else if (elem_size == 2)
+        {
+            bitMatAnd<unsigned short>(
+                        PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                        mask, num_channels, stream);
+        }
+        else if (elem_size == 4)
+        {
+            bitMatAnd<unsigned int>(
+                        PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                        mask, num_channels, stream);
+        }
      }
  }
  
@@ -2063,7 +2134,7 @@ void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, co
  
      const int depth = src1.depth();
  
-    CV_Assert( depth <= CV_64F );
+    CV_Assert( depth < CV_32F );
      CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
      CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) );
  
@@ -2071,36 +2142,73 @@ void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, co
  
      cudaStream_t stream = StreamAccessor::getStream(s);
  
-    const int bcols = (int) (src1.cols * src1.elemSize());
-
-    if ((bcols & 3) == 0)
+    if (mask.empty())
      {
-        const int vcols = bcols >> 2;
+        const int bcols = (int) (src1.cols * src1.elemSize());
+        bool aligned =
+                isAligned(src1.data, sizeof(unsigned int)) &&
+                isAligned(src2.data, sizeof(unsigned int)) &&
+                isAligned(dst.data, sizeof(unsigned int));
  
-        bitMatOr<unsigned int>(
-                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
-                    mask, stream);
-    }
-    else if ((bcols & 1) == 0)
-    {
-        const int vcols = bcols >> 1;
+        if (aligned && (bcols & 3) == 0)
+        {
+            const int vcols = bcols >> 2;
  
-        bitMatOr<unsigned short>(
-                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
-                    mask, stream);
+            bitMatOr<unsigned int>(
+                        PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+                        PtrStepb(), 1, stream);
+        }
+        else if (aligned && (bcols & 1) == 0)
+        {
+            const int vcols = bcols >> 1;
+
+            bitMatOr<unsigned short>(
+                        PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+                        PtrStepb(), 1, stream);
+        }
+        else
+        {
+            bitMatOr<unsigned char>(
+                        PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                        PtrStepb(), 1, stream);
+        }
      }
      else
      {
+        const int elem_size = src1.elemSize1();
+        const int num_channels = src1.channels();
+        const int bcols = src1.cols * num_channels;
  
-        bitMatOr<unsigned char>(
-                    PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
-                    mask, stream);
+        if (elem_size == 1)
+        {
+            bitMatOr<unsigned char>(
+                        PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                        mask, num_channels, stream);
+        }
+        else if (elem_size == 2)
+        {
+            bitMatOr<unsigned short>(
+                        PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                        mask, num_channels, stream);
+        }
+        else if (elem_size == 4)
+        {
+            bitMatOr<unsigned int>(
+                        PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                        mask, num_channels, stream);
+        }
      }
  }
  
@@ -2110,7 +2218,7 @@ void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
  
      const int depth = src1.depth();
  
-    CV_Assert( depth <= CV_64F );
+    CV_Assert( depth < CV_32F );
      CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
      CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) );
  
@@ -2118,36 +2226,73 @@ void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
  
      cudaStream_t stream = StreamAccessor::getStream(s);
  
-    const int bcols = (int) (src1.cols * src1.elemSize());
-
-    if ((bcols & 3) == 0)
+    if (mask.empty())
      {
-        const int vcols = bcols >> 2;
+        const int bcols = (int) (src1.cols * src1.elemSize());
+        bool aligned =
+                isAligned(src1.data, sizeof(unsigned int)) &&
+                isAligned(src2.data, sizeof(unsigned int)) &&
+                isAligned(dst.data, sizeof(unsigned int));
  
-        bitMatXor<unsigned int>(
-                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
-                    mask, stream);
-    }
-    else if ((bcols & 1) == 0)
-    {
-        const int vcols = bcols >> 1;
+        if (aligned && (bcols & 3) == 0)
+        {
+            const int vcols = bcols >> 2;
  
-        bitMatXor<unsigned short>(
-                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
-                    mask, stream);
+            bitMatXor<unsigned int>(
+                        PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+                        PtrStepb(), 1, stream);
+        }
+        else if (aligned && (bcols & 1) == 0)
+        {
+            const int vcols = bcols >> 1;
+
+            bitMatXor<unsigned short>(
+                        PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+                        PtrStepb(), 1, stream);
+        }
+        else
+        {
+            bitMatXor<unsigned char>(
+                        PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                        PtrStepb(), 1, stream);
+        }
      }
      else
      {
+        const int elem_size = src1.elemSize1();
+        const int num_channels = src1.channels();
+        const int bcols = src1.cols * num_channels;
  
-        bitMatXor<unsigned char>(
-                    PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
-                    mask, stream);
+        if (elem_size == 1)
+        {
+            bitMatXor<unsigned char>(
+                        PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                        mask, num_channels, stream);
+        }
+        else if (elem_size == 2)
+        {
+            bitMatXor<unsigned short>(
+                        PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                        mask, num_channels, stream);
+        }
+        else if (elem_size == 4)
+        {
+            bitMatXor<unsigned int>(
+                        PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                        PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                        PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                        mask, num_channels, stream);
+        }
      }
  }
  
diff --git a/modules/gpu/test/test_core.cpp b/modules/gpu/test/test_core.cpp

index 7ceeaed..b8b83ef 100644 (file)
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
@@ -1785,72 +1785,95 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Compare_Scalar, testing::Combine(
  //////////////////////////////////////////////////////////////////////////////
  // Bitwise_Array
  
-PARAM_TEST_CASE(Bitwise_Array, cv::gpu::DeviceInfo, cv::Size, MatType)
+PARAM_TEST_CASE(Bitwise_Array, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
  {
      cv::gpu::DeviceInfo devInfo;
      cv::Size size;
      int type;
+    bool useRoi;
  
      cv::Mat src1;
      cv::Mat src2;
  
+    cv::Mat mask;
+
      virtual void SetUp()
      {
          devInfo = GET_PARAM(0);
          size = GET_PARAM(1);
          type = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
  
          cv::gpu::setDevice(devInfo.deviceID());
  
          src1 = randomMat(size, type, 0.0, std::numeric_limits<int>::max());
          src2 = randomMat(size, type, 0.0, std::numeric_limits<int>::max());
+
+        mask = randomMat(size, CV_8UC1, 0.0, 2.0);
      }
  };
  
  GPU_TEST_P(Bitwise_Array, Not)
  {
-    cv::gpu::GpuMat dst;
-    cv::gpu::bitwise_not(loadMat(src1), dst);
+    cv::gpu::GpuMat dst_nomask, dst_mask(src1.size(), src1.type(), cv::Scalar::all(0));
+    cv::gpu::bitwise_not(loadMat(src1, useRoi), dst_nomask);
+    cv::gpu::bitwise_not(loadMat(src1, useRoi), dst_mask, loadMat(mask, useRoi));
  
-    cv::Mat dst_gold = ~src1;
+    cv::Mat dst_gold_nomask, dst_gold_mask(src1.size(), src1.type(), cv::Scalar::all(0));
+    cv::bitwise_not(src1, dst_gold_nomask);
+    cv::bitwise_not(src1, dst_gold_mask, mask);
  
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+    EXPECT_MAT_NEAR(dst_gold_nomask, dst_nomask, 0.0);
+    EXPECT_MAT_NEAR(dst_gold_mask, dst_mask, 0.0);
  }
  
  GPU_TEST_P(Bitwise_Array, Or)
  {
-    cv::gpu::GpuMat dst;
-    cv::gpu::bitwise_or(loadMat(src1), loadMat(src2), dst);
+    cv::gpu::GpuMat dst_nomask, dst_mask(src1.size(), src1.type(), cv::Scalar::all(0));
+    cv::gpu::bitwise_or(loadMat(src1, useRoi), loadMat(src2, useRoi), dst_nomask);
+    cv::gpu::bitwise_or(loadMat(src1, useRoi), loadMat(src2, useRoi), dst_mask, loadMat(mask, useRoi));
  
-    cv::Mat dst_gold = src1 | src2;
+    cv::Mat dst_gold_nomask, dst_gold_mask(src1.size(), src1.type(), cv::Scalar::all(0));
+    cv::bitwise_or(src1, src2, dst_gold_nomask);
+    cv::bitwise_or(src1, src2, dst_gold_mask, mask);
  
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+    EXPECT_MAT_NEAR(dst_gold_nomask, dst_nomask, 0.0);
+    EXPECT_MAT_NEAR(dst_gold_mask, dst_mask, 0.0);
  }
  
  GPU_TEST_P(Bitwise_Array, And)
  {
-    cv::gpu::GpuMat dst;
-    cv::gpu::bitwise_and(loadMat(src1), loadMat(src2), dst);
+    cv::gpu::GpuMat dst_nomask, dst_mask(src1.size(), src1.type(), cv::Scalar::all(0));
+    cv::gpu::bitwise_and(loadMat(src1, useRoi), loadMat(src2, useRoi), dst_nomask);
+    cv::gpu::bitwise_and(loadMat(src1, useRoi), loadMat(src2, useRoi), dst_mask, loadMat(mask, useRoi));
  
-    cv::Mat dst_gold = src1 & src2;
+    cv::Mat dst_gold_nomask, dst_gold_mask(src1.size(), src1.type(), cv::Scalar::all(0));
+    cv::bitwise_and(src1, src2, dst_gold_nomask);
+    cv::bitwise_and(src1, src2, dst_gold_mask, mask);
  
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+    EXPECT_MAT_NEAR(dst_gold_nomask, dst_nomask, 0.0);
+    EXPECT_MAT_NEAR(dst_gold_mask, dst_mask, 0.0);
  }
  
  GPU_TEST_P(Bitwise_Array, Xor)
  {
-    cv::gpu::GpuMat dst;
-    cv::gpu::bitwise_xor(loadMat(src1), loadMat(src2), dst);
+    cv::gpu::GpuMat dst_nomask, dst_mask(src1.size(), src1.type(), cv::Scalar::all(0));
+    cv::gpu::bitwise_xor(loadMat(src1, useRoi), loadMat(src2, useRoi), dst_nomask);
+    cv::gpu::bitwise_xor(loadMat(src1, useRoi), loadMat(src2, useRoi), dst_mask, loadMat(mask, useRoi));
  
-    cv::Mat dst_gold = src1 ^ src2;
+    cv::Mat dst_gold_nomask, dst_gold_mask(src1.size(), src1.type(), cv::Scalar::all(0));
+    cv::bitwise_xor(src1, src2, dst_gold_nomask);
+    cv::bitwise_xor(src1, src2, dst_gold_mask, mask);
  
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+    EXPECT_MAT_NEAR(dst_gold_nomask, dst_nomask, 0.0);
+    EXPECT_MAT_NEAR(dst_gold_mask, dst_mask, 0.0);
  }
  
  INSTANTIATE_TEST_CASE_P(GPU_Core, Bitwise_Array, testing::Combine(
      ALL_DEVICES,
      DIFFERENT_SIZES,
-    TYPES(CV_8U, CV_32S, 1, 4)));
+    TYPES(CV_8U, CV_32S, 1, 4),
+    WHOLE_SUBMAT));
  
  //////////////////////////////////////////////////////////////////////////////
  // Bitwise_Scalar
author	Vladislav Vinogradov <vlad.vinogradov@itseez.com>
	Wed, 6 May 2015 10:28:08 +0000 (13:28 +0300)
committer	Vladislav Vinogradov <vlad.vinogradov@itseez.com>
	Wed, 6 May 2015 10:28:08 +0000 (13:28 +0300)
modules/gpu/src/cuda/element_operations.cu		patch | blob | history
modules/gpu/src/element_operations.cpp		patch | blob | history
modules/gpu/test/test_core.cpp		patch | blob | history