used new device layer for cv::gpu::transpose

author Vladislav Vinogradov <vlad.vinogradov@itseez.com>

Mon, 26 Aug 2013 06:43:08 +0000 (10:43 +0400)

committer Vladislav Vinogradov <vlad.vinogradov@itseez.com>

Tue, 1 Oct 2013 08:18:37 +0000 (12:18 +0400)
author Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Mon, 26 Aug 2013 06:43:08 +0000 (10:43 +0400)
committer Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Tue, 1 Oct 2013 08:18:37 +0000 (12:18 +0400)
diff --git a/modules/cudaarithm/src/core.cpp b/modules/cudaarithm/src/core.cpp

index 0da1fcc..49cd57f 100644 (file)
--- a/modules/cudaarithm/src/core.cpp
+++ b/modules/cudaarithm/src/core.cpp
@@ -64,52 +64,6 @@ void cv::cuda::copyMakeBorder(InputArray, OutputArray, int, int, int, int, int,
  #else /* !defined (HAVE_CUDA) */
  
  ////////////////////////////////////////////////////////////////////////
-// transpose
-
-namespace arithm
-{
-    template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream);
-}
-
-void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& _stream)
-{
-    GpuMat src = _src.getGpuMat();
-
-    CV_Assert( src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8 );
-
-    _dst.create( src.cols, src.rows, src.type() );
-    GpuMat dst = _dst.getGpuMat();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-
-    if (src.elemSize() == 1)
-    {
-        NppStreamHandler h(stream);
-
-        NppiSize sz;
-        sz.width  = src.cols;
-        sz.height = src.rows;
-
-        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    else if (src.elemSize() == 4)
-    {
-        arithm::transpose<int>(src, dst, stream);
-    }
-    else // if (src.elemSize() == 8)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-
-        arithm::transpose<double>(src, dst, stream);
-    }
-}
-
-////////////////////////////////////////////////////////////////////////
  // flip
  
  namespace
diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu

index 6111b4b..aa85004 100644 (file)
--- a/modules/cudaarithm/src/cuda/transpose.cu
+++ b/modules/cudaarithm/src/cuda/transpose.cu
@@ -40,83 +40,53 @@
  //
  //M*/
  
-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
  
-#include "opencv2/core/cuda/common.hpp"
+#ifndef HAVE_OPENCV_CUDEV
  
-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#error "opencv_cudev is required"
  
-namespace arithm
-{
-    const int TRANSPOSE_TILE_DIM   = 16;
-    const int TRANSPOSE_BLOCK_ROWS = 16;
-
-    template <typename T>
-    __global__ void transposeKernel(const PtrStepSz<T> src, PtrStep<T> dst)
-    {
-        __shared__ T tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1];
+#else
  
-        int blockIdx_x, blockIdx_y;
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
  
-        // do diagonal reordering
-        if (gridDim.x == gridDim.y)
-        {
-            blockIdx_y = blockIdx.x;
-            blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
-        }
-        else
-        {
-            int bid = blockIdx.x + gridDim.x * blockIdx.y;
-            blockIdx_y = bid % gridDim.y;
-            blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
-        }
+using namespace cv::cudev;
  
-        int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
-        int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
-
-        if (xIndex < src.cols)
-        {
-            for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
-            {
-                if (yIndex + i < src.rows)
-                {
-                    tile[threadIdx.y + i][threadIdx.x] = src(yIndex + i, xIndex);
-                }
-            }
-        }
+void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
+{
+    GpuMat src = _src.getGpuMat();
  
-        __syncthreads();
+    const size_t elemSize = src.elemSize();
  
-        xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
-        yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
+    CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 );
  
-        if (xIndex < src.rows)
-        {
-            for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
-            {
-                if (yIndex + i < src.cols)
-                {
-                    dst(yIndex + i, xIndex) = tile[threadIdx.x][threadIdx.y + i];
-                }
-            }
-        }
-    }
+    _dst.create( src.cols, src.rows, src.type() );
+    GpuMat dst = _dst.getGpuMat();
  
-    template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
+    if (elemSize == 1)
      {
-        const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
-        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+        NppStreamHandler h(StreamAccessor::getStream(stream));
  
-        transposeKernel<<<grid, block, 0, stream>>>(src, dst);
-        cudaSafeCall( cudaGetLastError() );
+        NppiSize sz;
+        sz.width  = src.cols;
+        sz.height = src.rows;
  
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
+        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
  
-    template void transpose<int>(PtrStepSz<int> src, PtrStepSz<int> dst, cudaStream_t stream);
-    template void transpose<double>(PtrStepSz<double> src, PtrStepSz<double> dst, cudaStream_t stream);
+        if (!stream)
+            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
+    }
+    else if (elemSize == 4)
+    {
+        gridTranspose(globPtr<int>(src), globPtr<int>(dst), stream);
+    }
+    else // if (elemSize == 8)
+    {
+        gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
+    }
  }
  
-#endif // CUDA_DISABLER
+#endif
diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/transpose.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/transpose.hpp

index 3854a17..83ee96b 100644 (file)
--- a/modules/cudev/include/opencv2/cudev/grid/detail/transpose.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/transpose.hpp
@@ -55,15 +55,12 @@ namespace cv { namespace cudev {
  
  namespace transpose_detail
  {
-    const int TRANSPOSE_TILE_DIM   = 16;
-    const int TRANSPOSE_BLOCK_ROWS = 16;
-
-    template <class SrcPtr, typename DstType>
+    template <int TILE_DIM, int BLOCK_DIM_Y, class SrcPtr, typename DstType>
      __global__ void transpose(const SrcPtr src, GlobPtr<DstType> dst, const int rows, const int cols)
      {
          typedef typename PtrTraits<SrcPtr>::value_type src_type;
  
-        __shared__ src_type tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1];
+        __shared__ src_type tile[TILE_DIM][TILE_DIM + 1];
  
          int blockIdx_x, blockIdx_y;
  
@@ -80,12 +77,12 @@ namespace transpose_detail
              blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
          }
  
-        int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
-        int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
+        int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
+        int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
  
          if (xIndex < cols)
          {
-            for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
+            for (int i = 0; i < TILE_DIM; i += BLOCK_DIM_Y)
              {
                  if (yIndex + i < rows)
                  {
@@ -96,12 +93,12 @@ namespace transpose_detail
  
          __syncthreads();
  
-        xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
-        yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
+        xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
+        yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
  
          if (xIndex < rows)
          {
-            for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
+            for (int i = 0; i < TILE_DIM; i += BLOCK_DIM_Y)
              {
                  if (yIndex + i < cols)
                  {
@@ -111,13 +108,13 @@ namespace transpose_detail
          }
      }
  
-    template <class SrcPtr, typename DstType>
+    template <class Policy, class SrcPtr, typename DstType>
      __host__ void transpose(const SrcPtr& src, const GlobPtr<DstType>& dst, int rows, int cols, cudaStream_t stream)
      {
-        const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
+        const dim3 block(Policy::tile_dim, Policy::block_dim_y);
          const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
  
-        transpose<<<grid, block, 0, stream>>>(src, dst, rows, cols);
+        transpose<Policy::tile_dim, Policy::block_dim_y><<<grid, block, 0, stream>>>(src, dst, rows, cols);
          CV_CUDEV_SAFE_CALL( cudaGetLastError() );
  
          if (stream == 0)
diff --git a/modules/cudev/include/opencv2/cudev/grid/transpose.hpp b/modules/cudev/include/opencv2/cudev/grid/transpose.hpp

index 5e8f248..cf1bf83 100644 (file)
--- a/modules/cudev/include/opencv2/cudev/grid/transpose.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/transpose.hpp
@@ -49,19 +49,53 @@
  #include "../common.hpp"
  #include "../ptr2d/traits.hpp"
  #include "../ptr2d/gpumat.hpp"
+#include "../ptr2d/glob.hpp"
  #include "detail/transpose.hpp"
  
  namespace cv { namespace cudev {
  
-template <class SrcPtr, typename DstType>
-__host__ void gridTranspose(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
+template <class Policy, class SrcPtr, typename DstType>
+__host__ void gridTranspose_(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
  {
      const int rows = getRows(src);
      const int cols = getCols(src);
  
      dst.create(cols, rows);
  
-    transpose_detail::transpose(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
+    transpose_detail::transpose<Policy>(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
+}
+
+template <class Policy, class SrcPtr, typename DstType>
+__host__ void gridTranspose_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
+{
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(dst) == cols && getCols(dst) == rows );
+
+    transpose_detail::transpose<Policy>(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
+}
+
+// Default Policy
+
+struct DefaultTransposePolicy
+{
+    enum {
+        tile_dim    = 16,
+        block_dim_y = 16
+    };
+};
+
+template <class SrcPtr, typename DstType>
+__host__ void gridTranspose(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
+{
+    gridTranspose_<DefaultTransposePolicy>(src, dst, stream);
+}
+
+template <class SrcPtr, typename DstType>
+__host__ void gridTranspose(const SrcPtr& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
+{
+    gridTranspose_<DefaultTransposePolicy>(src, dst, stream);
  }
  
  }}
author	Vladislav Vinogradov <vlad.vinogradov@itseez.com>
	Mon, 26 Aug 2013 06:43:08 +0000 (10:43 +0400)
committer	Vladislav Vinogradov <vlad.vinogradov@itseez.com>
	Tue, 1 Oct 2013 08:18:37 +0000 (12:18 +0400)
modules/cudaarithm/src/core.cpp		patch \| blob \| history
modules/cudaarithm/src/cuda/transpose.cu		patch \| blob \| history
modules/cudev/include/opencv2/cudev/grid/detail/transpose.hpp		patch \| blob \| history
modules/cudev/include/opencv2/cudev/grid/transpose.hpp		patch \| blob \| history