#else /* !defined (HAVE_CUDA) */
////////////////////////////////////////////////////////////////////////
-// transpose
-
-namespace arithm
-{
- template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream);
-}
-
-void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& _stream)
-{
- GpuMat src = _src.getGpuMat();
-
- CV_Assert( src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8 );
-
- _dst.create( src.cols, src.rows, src.type() );
- GpuMat dst = _dst.getGpuMat();
-
- cudaStream_t stream = StreamAccessor::getStream(_stream);
-
- if (src.elemSize() == 1)
- {
- NppStreamHandler h(stream);
-
- NppiSize sz;
- sz.width = src.cols;
- sz.height = src.rows;
-
- nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
- dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- else if (src.elemSize() == 4)
- {
- arithm::transpose<int>(src, dst, stream);
- }
- else // if (src.elemSize() == 8)
- {
- if (!deviceSupports(NATIVE_DOUBLE))
- CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-
- arithm::transpose<double>(src, dst, stream);
- }
-}
-
-////////////////////////////////////////////////////////////////////////
// flip
namespace
//
//M*/
-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
-#include "opencv2/core/cuda/common.hpp"
+#ifndef HAVE_OPENCV_CUDEV
-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#error "opencv_cudev is required"
-namespace arithm
-{
- const int TRANSPOSE_TILE_DIM = 16;
- const int TRANSPOSE_BLOCK_ROWS = 16;
-
- template <typename T>
- __global__ void transposeKernel(const PtrStepSz<T> src, PtrStep<T> dst)
- {
- __shared__ T tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1];
+#else
- int blockIdx_x, blockIdx_y;
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
- // do diagonal reordering
- if (gridDim.x == gridDim.y)
- {
- blockIdx_y = blockIdx.x;
- blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
- }
- else
- {
- int bid = blockIdx.x + gridDim.x * blockIdx.y;
- blockIdx_y = bid % gridDim.y;
- blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
- }
+using namespace cv::cudev;
- int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
- int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
-
- if (xIndex < src.cols)
- {
- for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
- {
- if (yIndex + i < src.rows)
- {
- tile[threadIdx.y + i][threadIdx.x] = src(yIndex + i, xIndex);
- }
- }
- }
+void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
+{
+ GpuMat src = _src.getGpuMat();
- __syncthreads();
+ const size_t elemSize = src.elemSize();
- xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
- yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
+ CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 );
- if (xIndex < src.rows)
- {
- for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
- {
- if (yIndex + i < src.cols)
- {
- dst(yIndex + i, xIndex) = tile[threadIdx.x][threadIdx.y + i];
- }
- }
- }
- }
+ _dst.create( src.cols, src.rows, src.type() );
+ GpuMat dst = _dst.getGpuMat();
- template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
+ if (elemSize == 1)
{
- const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
- const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+ NppStreamHandler h(StreamAccessor::getStream(stream));
- transposeKernel<<<grid, block, 0, stream>>>(src, dst);
- cudaSafeCall( cudaGetLastError() );
+ NppiSize sz;
+ sz.width = src.cols;
+ sz.height = src.rows;
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
+ nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+ dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
- template void transpose<int>(PtrStepSz<int> src, PtrStepSz<int> dst, cudaStream_t stream);
- template void transpose<double>(PtrStepSz<double> src, PtrStepSz<double> dst, cudaStream_t stream);
+ if (!stream)
+ CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
+ }
+ else if (elemSize == 4)
+ {
+ gridTranspose(globPtr<int>(src), globPtr<int>(dst), stream);
+ }
+ else // if (elemSize == 8)
+ {
+ gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
+ }
}
-#endif // CUDA_DISABLER
+#endif
namespace transpose_detail
{
- const int TRANSPOSE_TILE_DIM = 16;
- const int TRANSPOSE_BLOCK_ROWS = 16;
-
- template <class SrcPtr, typename DstType>
+ template <int TILE_DIM, int BLOCK_DIM_Y, class SrcPtr, typename DstType>
__global__ void transpose(const SrcPtr src, GlobPtr<DstType> dst, const int rows, const int cols)
{
typedef typename PtrTraits<SrcPtr>::value_type src_type;
- __shared__ src_type tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1];
+ __shared__ src_type tile[TILE_DIM][TILE_DIM + 1];
int blockIdx_x, blockIdx_y;
blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
}
- int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
- int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
+ int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
+ int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
if (xIndex < cols)
{
- for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
+ for (int i = 0; i < TILE_DIM; i += BLOCK_DIM_Y)
{
if (yIndex + i < rows)
{
__syncthreads();
- xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
- yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
+ xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
+ yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
if (xIndex < rows)
{
- for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
+ for (int i = 0; i < TILE_DIM; i += BLOCK_DIM_Y)
{
if (yIndex + i < cols)
{
}
}
- template <class SrcPtr, typename DstType>
+ template <class Policy, class SrcPtr, typename DstType>
__host__ void transpose(const SrcPtr& src, const GlobPtr<DstType>& dst, int rows, int cols, cudaStream_t stream)
{
- const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
+ const dim3 block(Policy::tile_dim, Policy::block_dim_y);
const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
- transpose<<<grid, block, 0, stream>>>(src, dst, rows, cols);
+ transpose<Policy::tile_dim, Policy::block_dim_y><<<grid, block, 0, stream>>>(src, dst, rows, cols);
CV_CUDEV_SAFE_CALL( cudaGetLastError() );
if (stream == 0)
#include "../common.hpp"
#include "../ptr2d/traits.hpp"
#include "../ptr2d/gpumat.hpp"
+#include "../ptr2d/glob.hpp"
#include "detail/transpose.hpp"
namespace cv { namespace cudev {
-template <class SrcPtr, typename DstType>
-__host__ void gridTranspose(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
+template <class Policy, class SrcPtr, typename DstType>
+__host__ void gridTranspose_(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{
const int rows = getRows(src);
const int cols = getCols(src);
dst.create(cols, rows);
- transpose_detail::transpose(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
+ transpose_detail::transpose<Policy>(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
+}
+
+template <class Policy, class SrcPtr, typename DstType>
+__host__ void gridTranspose_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
+{
+ const int rows = getRows(src);
+ const int cols = getCols(src);
+
+ CV_Assert( getRows(dst) == cols && getCols(dst) == rows );
+
+ transpose_detail::transpose<Policy>(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
+}
+
+// Default Policy
+
+struct DefaultTransposePolicy
+{
+ enum {
+ tile_dim = 16,
+ block_dim_y = 16
+ };
+};
+
+template <class SrcPtr, typename DstType>
+__host__ void gridTranspose(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
+{
+ gridTranspose_<DefaultTransposePolicy>(src, dst, stream);
+}
+
+template <class SrcPtr, typename DstType>
+__host__ void gridTranspose(const SrcPtr& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
+{
+ gridTranspose_<DefaultTransposePolicy>(src, dst, stream);
}
}}