namespace cv\r
{\r
namespace gpu\r
- { \r
+ {\r
//////////////////////////////// Initialization ////////////////////////\r
- \r
+\r
//! This is the only function that do not throw exceptions if the library is compiled without Cuda.\r
CV_EXPORTS int getCudaEnabledDeviceCount();\r
\r
//! Functions below throw cv::Expception if the library is compiled without Cuda.\r
CV_EXPORTS string getDeviceName(int device);\r
- CV_EXPORTS void setDevice(int device); \r
- CV_EXPORTS int getDevice(); \r
+ CV_EXPORTS void setDevice(int device);\r
+ CV_EXPORTS int getDevice();\r
\r
CV_EXPORTS void getComputeCapability(int device, int* major, int* minor);\r
CV_EXPORTS int getNumberOfSMs(int device);\r
- \r
+\r
//////////////////////////////// GpuMat ////////////////////////////////\r
- class CudaStrem;\r
+ class CudaStream;\r
\r
- //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat. \r
+ //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.\r
class CV_EXPORTS GpuMat\r
{\r
public:\r
GpuMat(Size _size, int _type, const Scalar& _s);\r
//! copy constructor\r
GpuMat(const GpuMat& m);\r
- \r
+\r
//! constructor for GpuMatrix headers pointing to user-allocated data\r
GpuMat(int _rows, int _cols, int _type, void* _data, size_t _step = Mat::AUTO_STEP);\r
GpuMat(Size _size, int _type, void* _data, size_t _step = Mat::AUTO_STEP);\r
//! creates a matrix header for a part of the bigger matrix\r
GpuMat(const GpuMat& m, const Range& rowRange, const Range& colRange);\r
GpuMat(const GpuMat& m, const Rect& roi);\r
- \r
+\r
//! builds GpuMat from Mat. Perfom blocking upload to device.\r
explicit GpuMat (const Mat& m);\r
\r
//! assignment operators\r
GpuMat& operator = (const GpuMat& m);\r
//! assignment operator. Perfom blocking upload to device.\r
- GpuMat& operator = (const Mat& m); \r
+ GpuMat& operator = (const Mat& m);\r
\r
//! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code.\r
// Contains just image size, data ptr and step.\r
\r
//! Downloads data from device to host memory. Blocking calls.\r
operator Mat() const;\r
- void download(cv::Mat& m) const; \r
+ void download(cv::Mat& m) const;\r
\r
//! returns a new GpuMatrix header for the specified row\r
GpuMat row(int y) const;\r
//! extracts a rectangular sub-GpuMatrix\r
// (this is a generalized form of row, rowRange etc.)\r
GpuMat operator()( Range rowRange, Range colRange ) const;\r
- GpuMat operator()( const Rect& roi ) const; \r
+ GpuMat operator()( const Rect& roi ) const;\r
\r
//! returns true iff the GpuMatrix data is continuous\r
// (i.e. when there are no gaps between successive rows).\r
// Page locked memory is only needed for async and faster coping to GPU.\r
// It is convertable to cv::Mat header without reference counting\r
// so you can use it with other opencv functions.\r
- \r
+\r
class CV_EXPORTS MatPL\r
{\r
- public: \r
+ public:\r
\r
//Not supported. Now behaviour is like ALLOC_DEFAULT.\r
//enum { ALLOC_DEFAULT = 0, ALLOC_PORTABLE = 1, ALLOC_WRITE_COMBINED = 4 }\r
\r
- MatPL(); \r
- MatPL(const MatPL& m); \r
+ MatPL();\r
+ MatPL(const MatPL& m);\r
\r
MatPL(int _rows, int _cols, int _type);\r
- MatPL(Size _size, int _type); \r
+ MatPL(Size _size, int _type);\r
\r
//! creates from cv::Mat with coping data\r
explicit MatPL(const Mat& m);\r
- \r
- ~MatPL(); \r
+\r
+ ~MatPL();\r
\r
MatPL& operator = (const MatPL& m);\r
- \r
+\r
//! returns deep copy of the matrix, i.e. the data is copied\r
MatPL clone() const;\r
- \r
- //! allocates new matrix data unless the matrix already has specified size and type. \r
+\r
+ //! allocates new matrix data unless the matrix already has specified size and type.\r
void create(int _rows, int _cols, int _type);\r
- void create(Size _size, int _type); \r
+ void create(Size _size, int _type);\r
\r
//! decrements reference counter and released memory if needed.\r
void release();\r
//! returns matrix header with disabled reference counting for MatPL data.\r
Mat createMatHeader() const;\r
operator Mat() const;\r
- \r
+\r
// Please see cv::Mat for descriptions\r
- bool isContinuous() const; \r
- size_t elemSize() const; \r
- size_t elemSize1() const; \r
- int type() const; \r
- int depth() const; \r
- int channels() const; \r
- size_t step1() const; \r
- Size size() const; \r
+ bool isContinuous() const;\r
+ size_t elemSize() const;\r
+ size_t elemSize1() const;\r
+ int type() const;\r
+ int depth() const;\r
+ int channels() const;\r
+ size_t step1() const;\r
+ Size size() const;\r
bool empty() const;\r
- \r
+\r
// Please see cv::Mat for descriptions\r
- int flags; \r
- int rows, cols; \r
+ int flags;\r
+ int rows, cols;\r
size_t step;\r
\r
- uchar* data; \r
- int* refcount; \r
+ uchar* data;\r
+ int* refcount;\r
\r
uchar* datastart;\r
uchar* dataend;\r
class CV_EXPORTS CudaStream\r
{\r
public:\r
- CudaStream(); \r
+ CudaStream();\r
~CudaStream();\r
\r
- CudaStream(const CudaStream&); \r
+ CudaStream(const CudaStream&);\r
CudaStream& operator=(const CudaStream&);\r
\r
bool queryIfComplete();\r
- void waitForCompletion(); \r
+ void waitForCompletion();\r
\r
- //! downloads asynchronously. \r
+ //! downloads asynchronously.\r
// Warning! cv::Mat must point to page locked memory (i.e. to MatPL data or to its subMat)\r
void enqueueDownload(const GpuMat& src, MatPL& dst);\r
void enqueueDownload(const GpuMat& src, Mat& dst);\r
\r
- //! uploads asynchronously. \r
+ //! uploads asynchronously.\r
// Warning! cv::Mat must point to page locked memory (i.e. to MatPL data or to its ROI)\r
- void enqueueUpload(const MatPL& src, GpuMat& dst); \r
+ void enqueueUpload(const MatPL& src, GpuMat& dst);\r
void enqueueUpload(const Mat& src, GpuMat& dst);\r
\r
void enqueueCopy(const GpuMat& src, GpuMat& dst);\r
- \r
- void enqueueMemSet(const GpuMat& src, Scalar val); \r
+\r
+ void enqueueMemSet(const GpuMat& src, Scalar val);\r
void enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask);\r
\r
// converts matrix type, ex from float to uchar depending on type\r
- void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0); \r
+ void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);\r
private:\r
void create();\r
void release();\r
struct Impl;\r
- Impl *impl; \r
+ Impl *impl;\r
friend struct StreamAccessor;\r
};\r
\r
//! Acync version\r
void operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, const CudaStream& stream);\r
\r
- //! Some heuristics that tries to estmate \r
+ //! Some heuristics that tries to estmate\r
// if current GPU will be faster then CPU in this algorithm.\r
// It queries current active device.\r
static bool checkIfGpuCallReasonable();\r
int ndisp;\r
int winSize;\r
int preset;\r
- \r
+\r
// If avergeTexThreshold == 0 => post procesing is disabled\r
// If avergeTexThreshold != 0 then disparity is set 0 in each point (x,y) where for left image\r
// SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold\r
- // i.e. input left image is low textured. \r
+ // i.e. input left image is low textured.\r
float avergeTexThreshold;\r
private:\r
GpuMat minSSD, leBuf, riBuf;\r
}\r
#include "opencv2/gpu/matrix_operations.hpp"\r
\r
-#endif /* __OPENCV_GPU_HPP__ */
\ No newline at end of file
+#endif /* __OPENCV_GPU_HPP__ */\r
#include <stddef.h>
#include <stdio.h>
-//#include <iostream>
#include "cuda_shared.hpp"
#include "cuda_runtime.h"
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////
- typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels);
+ typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);
template<typename T>
- void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels)
+ void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
dim3 threadsPerBlock(16,16, 1);
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
- ::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>>
- ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
- cudaSafeCall ( cudaThreadSynchronize() );
+ if (stream == 0)
+ {
+ ::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>>
+ ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
+ cudaSafeCall ( cudaThreadSynchronize() );
+ }
+ else
+ {
+ ::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
+ ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
+ }
}
- extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels)
+ extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
static CopyToFunc tab[8] =
{
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
- func(mat_src, mat_dst, mask, channels);
+ func(mat_src, mat_dst, mask, channels, stream);
}
////////////////////////////////// SetTo //////////////////////////////////
///////////////////////////////////////////////////////////////////////////
- typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels);
- typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels);
+ typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
+ typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);
template <typename T>
- void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels)
+ void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
- ::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
- cudaSafeCall ( cudaThreadSynchronize() );
+ if (stream == 0)
+ {
+ ::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
+ cudaSafeCall ( cudaThreadSynchronize() );
+ }
+ else
+ {
+ ::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
+ }
+
}
template <typename T>
- void set_to_without_mask_run(const DevMem2D& mat, int channels)
+ void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream)
{
dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
- ::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
- cudaSafeCall ( cudaThreadSynchronize() );
+ if (stream == 0)
+ {
+ ::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
+ cudaSafeCall ( cudaThreadSynchronize() );
+ }
+ else
+ {
+ ::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
+ }
}
- extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels)
+ extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels, const cudaStream_t & stream)
{
double data[4];
data[0] = scalar[0];
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
- func(mat, channels);
+ func(mat, channels, stream);
}
- extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels)
+ extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
double data[4];
data[0] = scalar[0];
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
- func(mat, mask, channels);
+ func(mat, mask, channels, stream);
}
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////
- typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta);
+ typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream);
template<typename T, typename DT>
- void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta)
+ void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
{
const int shift = ::mat_operators::ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift;
dim3 block(32, 8);
dim3 grid(divUp(width, block.x * shift), divUp(height, block.y));
-
- ::mat_operators::kernel_convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
-
- cudaSafeCall( cudaThreadSynchronize() );
- }
-
- extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta)
+ if (stream == 0)
+ {
+ ::mat_operators::kernel_convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
+ cudaSafeCall( cudaThreadSynchronize() );
+ }
+ else
+ {
+ ::mat_operators::kernel_convert_to<T, DT><<<grid, block, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
+ }
+ }
+
+ extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
{
static CvtFunc tab[8][8] =
{
CvtFunc func = tab[sdepth][ddepth];
if (func == 0)
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
- func(src, dst, width, height, alpha, beta);
+ func(src, dst, width, height, alpha, beta, stream);
}
} // namespace impl
} // namespace gpu