From 7b3bbcea7159e058c2f2512b5c5b4a2454408fb1 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 26 Aug 2013 10:43:08 +0400 Subject: [PATCH] used new device layer for cv::gpu::transpose --- modules/cudaarithm/src/core.cpp | 46 ---------- modules/cudaarithm/src/cuda/transpose.cu | 98 ++++++++-------------- .../opencv2/cudev/grid/detail/transpose.hpp | 25 +++--- .../cudev/include/opencv2/cudev/grid/transpose.hpp | 40 ++++++++- 4 files changed, 82 insertions(+), 127 deletions(-) diff --git a/modules/cudaarithm/src/core.cpp b/modules/cudaarithm/src/core.cpp index 0da1fcc..49cd57f 100644 --- a/modules/cudaarithm/src/core.cpp +++ b/modules/cudaarithm/src/core.cpp @@ -64,52 +64,6 @@ void cv::cuda::copyMakeBorder(InputArray, OutputArray, int, int, int, int, int, #else /* !defined (HAVE_CUDA) */ //////////////////////////////////////////////////////////////////////// -// transpose - -namespace arithm -{ - template void transpose(PtrStepSz src, PtrStepSz dst, cudaStream_t stream); -} - -void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& _stream) -{ - GpuMat src = _src.getGpuMat(); - - CV_Assert( src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8 ); - - _dst.create( src.cols, src.rows, src.type() ); - GpuMat dst = _dst.getGpuMat(); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - - if (src.elemSize() == 1) - { - NppStreamHandler h(stream); - - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - else if (src.elemSize() == 4) - { - arithm::transpose(src, dst, stream); - } - else // if (src.elemSize() == 8) - { - if (!deviceSupports(NATIVE_DOUBLE)) - CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); - - arithm::transpose(src, dst, stream); - } -} - -//////////////////////////////////////////////////////////////////////// // flip namespace diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index 6111b4b..aa85004 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -40,83 +40,53 @@ // //M*/ -#if !defined CUDA_DISABLER +#include "opencv2/opencv_modules.hpp" -#include "opencv2/core/cuda/common.hpp" +#ifndef HAVE_OPENCV_CUDEV -using namespace cv::cuda; -using namespace cv::cuda::device; +#error "opencv_cudev is required" -namespace arithm -{ - const int TRANSPOSE_TILE_DIM = 16; - const int TRANSPOSE_BLOCK_ROWS = 16; - - template - __global__ void transposeKernel(const PtrStepSz src, PtrStep dst) - { - __shared__ T tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1]; +#else - int blockIdx_x, blockIdx_y; +#include "opencv2/cudaarithm.hpp" +#include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" - // do diagonal reordering - if (gridDim.x == gridDim.y) - { - blockIdx_y = blockIdx.x; - blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x; - } - else - { - int bid = blockIdx.x + gridDim.x * blockIdx.y; - blockIdx_y = bid % gridDim.y; - blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x; - } +using namespace cv::cudev; - int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x; - int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y; - - if (xIndex < src.cols) - { - for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS) - { - if (yIndex + i < src.rows) - { - tile[threadIdx.y + i][threadIdx.x] = src(yIndex + i, xIndex); - } - } - } +void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) +{ + GpuMat src = _src.getGpuMat(); - __syncthreads(); + const size_t elemSize = src.elemSize(); - xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x; - yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y; + CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 ); - if (xIndex < src.rows) - { - for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS) - { - if (yIndex + i < src.cols) - { - dst(yIndex + i, xIndex) = tile[threadIdx.x][threadIdx.y + i]; - } - } - } - } + _dst.create( src.cols, src.rows, src.type() ); + GpuMat dst = _dst.getGpuMat(); - template void transpose(PtrStepSz src, PtrStepSz dst, cudaStream_t stream) + if (elemSize == 1) { - const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM); - const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); + NppStreamHandler h(StreamAccessor::getStream(stream)); - transposeKernel<<>>(src, dst); - cudaSafeCall( cudaGetLastError() ); + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); - template void transpose(PtrStepSz src, PtrStepSz dst, cudaStream_t stream); - template void transpose(PtrStepSz src, PtrStepSz dst, cudaStream_t stream); + if (!stream) + CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); + } + else if (elemSize == 4) + { + gridTranspose(globPtr(src), globPtr(dst), stream); + } + else // if (elemSize == 8) + { + gridTranspose(globPtr(src), globPtr(dst), stream); + } } -#endif // CUDA_DISABLER +#endif diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/transpose.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/transpose.hpp index 3854a17..83ee96b 100644 --- a/modules/cudev/include/opencv2/cudev/grid/detail/transpose.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/detail/transpose.hpp @@ -55,15 +55,12 @@ namespace cv { namespace cudev { namespace transpose_detail { - const int TRANSPOSE_TILE_DIM = 16; - const int TRANSPOSE_BLOCK_ROWS = 16; - - template + template __global__ void transpose(const SrcPtr src, GlobPtr dst, const int rows, const int cols) { typedef typename PtrTraits::value_type src_type; - __shared__ src_type tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1]; + __shared__ src_type tile[TILE_DIM][TILE_DIM + 1]; int blockIdx_x, blockIdx_y; @@ -80,12 +77,12 @@ namespace transpose_detail blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x; } - int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x; - int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y; + int xIndex = blockIdx_x * TILE_DIM + threadIdx.x; + int yIndex = blockIdx_y * TILE_DIM + threadIdx.y; if (xIndex < cols) { - for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS) + for (int i = 0; i < TILE_DIM; i += BLOCK_DIM_Y) { if (yIndex + i < rows) { @@ -96,12 +93,12 @@ namespace transpose_detail __syncthreads(); - xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x; - yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y; + xIndex = blockIdx_y * TILE_DIM + threadIdx.x; + yIndex = blockIdx_x * TILE_DIM + threadIdx.y; if (xIndex < rows) { - for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS) + for (int i = 0; i < TILE_DIM; i += BLOCK_DIM_Y) { if (yIndex + i < cols) { @@ -111,13 +108,13 @@ namespace transpose_detail } } - template + template __host__ void transpose(const SrcPtr& src, const GlobPtr& dst, int rows, int cols, cudaStream_t stream) { - const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM); + const dim3 block(Policy::tile_dim, Policy::block_dim_y); const dim3 grid(divUp(cols, block.x), divUp(rows, block.y)); - transpose<<>>(src, dst, rows, cols); + transpose<<>>(src, dst, rows, cols); CV_CUDEV_SAFE_CALL( cudaGetLastError() ); if (stream == 0) diff --git a/modules/cudev/include/opencv2/cudev/grid/transpose.hpp b/modules/cudev/include/opencv2/cudev/grid/transpose.hpp index 5e8f248..cf1bf83 100644 --- a/modules/cudev/include/opencv2/cudev/grid/transpose.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/transpose.hpp @@ -49,19 +49,53 @@ #include "../common.hpp" #include "../ptr2d/traits.hpp" #include "../ptr2d/gpumat.hpp" +#include "../ptr2d/glob.hpp" #include "detail/transpose.hpp" namespace cv { namespace cudev { -template -__host__ void gridTranspose(const SrcPtr& src, GpuMat_& dst, Stream& stream = Stream::Null()) +template +__host__ void gridTranspose_(const SrcPtr& src, GpuMat_& dst, Stream& stream = Stream::Null()) { const int rows = getRows(src); const int cols = getCols(src); dst.create(cols, rows); - transpose_detail::transpose(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream)); + transpose_detail::transpose(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream)); +} + +template +__host__ void gridTranspose_(const SrcPtr& src, const GlobPtrSz& dst, Stream& stream = Stream::Null()) +{ + const int rows = getRows(src); + const int cols = getCols(src); + + CV_Assert( getRows(dst) == cols && getCols(dst) == rows ); + + transpose_detail::transpose(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream)); +} + +// Default Policy + +struct DefaultTransposePolicy +{ + enum { + tile_dim = 16, + block_dim_y = 16 + }; +}; + +template +__host__ void gridTranspose(const SrcPtr& src, GpuMat_& dst, Stream& stream = Stream::Null()) +{ + gridTranspose_(src, dst, stream); +} + +template +__host__ void gridTranspose(const SrcPtr& src, const GlobPtrSz& dst, Stream& stream = Stream::Null()) +{ + gridTranspose_(src, dst, stream); } }} -- 2.7.4