1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Third party copyrights are property of their respective owners.
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
21 // * Redistribution's of source code must retain the above copyright notice,
22 // this list of conditions and the following disclaimer.
24 // * Redistribution's in binary form must reproduce the above copyright notice,
25 // this list of conditions and the following disclaimer in the documentation
26 // and/or other materials provided with the distribution.
28 // * The name of the copyright holders may not be used to endorse or promote products
29 // derived from this software without specific prior written permission.
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
44 #ifndef __OPENCV_CORE_CUDA_HPP__
45 #define __OPENCV_CORE_CUDA_HPP__
48 # error cuda.hpp header must be compiled as C++
51 #include "opencv2/core.hpp"
52 #include "opencv2/core/cuda_types.hpp"
54 namespace cv { namespace cuda {
56 //////////////////////////////// GpuMat ///////////////////////////////
58 // Smart pointer for GPU memory with reference counting.
59 // Its interface is mostly similar with cv::Mat.
// NOTE(review): this excerpt appears to be missing lines (braces, access
// specifiers, and some member declarations such as type()/depth()/release());
// compare against the full header before relying on it.
61 class CV_EXPORTS GpuMat
// Pluggable allocator interface: implementations provide device memory for GpuMat.
64 class CV_EXPORTS Allocator
67 virtual ~Allocator() {}
69 // allocator must fill data, step and refcount fields
70 virtual bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize) = 0;
71 virtual void free(GpuMat* mat) = 0;
//! default allocator used when none is passed to a constructor
75 static Allocator* defaultAllocator();
76 static void setDefaultAllocator(Allocator* allocator);
78 //! default constructor
79 explicit GpuMat(Allocator* allocator = defaultAllocator());
81 //! constructs GpuMat of the specified size and type
82 GpuMat(int rows, int cols, int type, Allocator* allocator = defaultAllocator());
83 GpuMat(Size size, int type, Allocator* allocator = defaultAllocator());
85 //! constructs GpuMat and fills it with the specified value s
86 GpuMat(int rows, int cols, int type, Scalar s, Allocator* allocator = defaultAllocator());
87 GpuMat(Size size, int type, Scalar s, Allocator* allocator = defaultAllocator());
//! copy constructor (shallow copy; shares data and reference counter)
90 GpuMat(const GpuMat& m);
92 //! constructor for GpuMat headers pointing to user-allocated data
93 GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
94 GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);
96 //! creates a GpuMat header for a part of the bigger matrix
97 GpuMat(const GpuMat& m, Range rowRange, Range colRange);
98 GpuMat(const GpuMat& m, Rect roi);
100 //! builds GpuMat from host memory (Blocking call)
101 explicit GpuMat(InputArray arr, Allocator* allocator = defaultAllocator());
103 //! destructor - calls release()
106 //! assignment operators
107 GpuMat& operator =(const GpuMat& m);
109 //! allocates new GpuMat data unless the GpuMat already has specified size and type
110 void create(int rows, int cols, int type);
111 void create(Size size, int type);
113 //! decreases reference counter, deallocate the data when reference counter reaches 0
116 //! swaps with other smart pointer
117 void swap(GpuMat& mat);
119 //! performs upload of data to GpuMat (Blocking call)
120 void upload(InputArray arr);
122 //! performs upload of data to GpuMat (Non-Blocking call)
123 void upload(InputArray arr, Stream& stream);
125 //! performs download of data from device to host memory (Blocking call)
126 void download(OutputArray dst) const;
128 //! performs download of data from device to host memory (Non-Blocking call)
129 void download(OutputArray dst, Stream& stream) const;
131 //! returns deep copy of the GpuMat, i.e. the data is copied
132 GpuMat clone() const;
134 //! copies the GpuMat content to device memory (Blocking call)
135 void copyTo(OutputArray dst) const;
137 //! copies the GpuMat content to device memory (Non-Blocking call)
138 void copyTo(OutputArray dst, Stream& stream) const;
140 //! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Blocking call)
141 void copyTo(OutputArray dst, InputArray mask) const;
143 //! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Non-Blocking call)
144 void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;
146 //! sets some of the GpuMat elements to s (Blocking call)
147 GpuMat& setTo(Scalar s);
149 //! sets some of the GpuMat elements to s (Non-Blocking call)
150 GpuMat& setTo(Scalar s, Stream& stream);
152 //! sets some of the GpuMat elements to s, according to the mask (Blocking call)
153 GpuMat& setTo(Scalar s, InputArray mask);
155 //! sets some of the GpuMat elements to s, according to the mask (Non-Blocking call)
156 GpuMat& setTo(Scalar s, InputArray mask, Stream& stream);
158 //! converts GpuMat to another datatype (Blocking call)
159 void convertTo(OutputArray dst, int rtype) const;
161 //! converts GpuMat to another datatype (Non-Blocking call)
162 void convertTo(OutputArray dst, int rtype, Stream& stream) const;
164 //! converts GpuMat to another datatype with scaling (Blocking call)
165 void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;
167 //! converts GpuMat to another datatype with scaling (Non-Blocking call)
168 void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;
170 //! converts GpuMat to another datatype with scaling (Non-Blocking call)
171 void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;
//! converts to the given type when type differs, otherwise copies; type == -1 keeps the current type
173 void assignTo(GpuMat& m, int type=-1) const;
175 //! returns pointer to y-th row
176 uchar* ptr(int y = 0);
177 const uchar* ptr(int y = 0) const;
179 //! template version of the above method
180 template<typename _Tp> _Tp* ptr(int y = 0);
181 template<typename _Tp> const _Tp* ptr(int y = 0) const;
//! implicit conversions to the device-side wrapper types used by CUDA kernels
183 template <typename _Tp> operator PtrStepSz<_Tp>() const;
184 template <typename _Tp> operator PtrStep<_Tp>() const;
186 //! returns a new GpuMat header for the specified row
187 GpuMat row(int y) const;
189 //! returns a new GpuMat header for the specified column
190 GpuMat col(int x) const;
192 //! ... for the specified row span
193 GpuMat rowRange(int startrow, int endrow) const;
194 GpuMat rowRange(Range r) const;
196 //! ... for the specified column span
197 GpuMat colRange(int startcol, int endcol) const;
198 GpuMat colRange(Range r) const;
200 //! extracts a rectangular sub-GpuMat (this is a generalized form of row, rowRange etc.)
201 GpuMat operator ()(Range rowRange, Range colRange) const;
202 GpuMat operator ()(Rect roi) const;
204 //! creates alternative GpuMat header for the same data, with different
205 //! number of channels and/or different number of rows
206 GpuMat reshape(int cn, int rows = 0) const;
208 //! locates GpuMat header within a parent GpuMat
209 void locateROI(Size& wholeSize, Point& ofs) const;
211 //! moves/resizes the current GpuMat ROI inside the parent GpuMat
212 GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);
214 //! returns true iff the GpuMat data is continuous
215 //! (i.e. when there are no gaps between successive rows)
216 bool isContinuous() const;
218 //! returns element size in bytes
219 size_t elemSize() const;
221 //! returns the size of element channel in bytes
222 size_t elemSize1() const;
224 //! returns element type
227 //! returns element depth
230 //! returns number of channels
231 int channels() const;
233 //! returns step/elemSize1()
234 size_t step1() const;
236 //! returns GpuMat size : width == number of columns, height == number of rows
239 //! returns true if GpuMat data is NULL
242 /*! includes several bit-fields:
243 - the magic signature
250 //! the number of rows and columns
253 //! a distance between successive rows in bytes; includes the gap if any
256 //! pointer to the data
259 //! pointer to the reference counter;
260 //! when GpuMat points to user-allocated data, the pointer is NULL
263 //! helper fields used in locateROI and adjustROI
264 const uchar* datastart;
265 const uchar* dataend;
//! allocator used for this matrix's device memory
268 Allocator* allocator;
271 //! creates continuous matrix
272 CV_EXPORTS void createContinuous(int rows, int cols, int type, OutputArray arr);
274 //! ensures that size of the given matrix is not less than (rows, cols) size
275 //! and the matrix type matches the specified one
276 CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);
//! returns a GpuMat of the requested size/type backed by the given buffer `mat`
278 CV_EXPORTS GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat);
280 //! BufferPool management (must be called before Stream creation)
281 CV_EXPORTS void setBufferPoolUsage(bool on);
282 CV_EXPORTS void setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount);
284 //////////////////////////////// CudaMem ////////////////////////////////
286 // CudaMem is limited cv::Mat with page locked memory allocation.
287 // Page locked memory is only needed for async and faster copying to GPU.
288 // It is convertible to cv::Mat header without reference counting
289 // so you can use it with other opencv functions.
// NOTE(review): several declarations (braces, access specifiers, destructor,
// release(), data fields) are not visible in this excerpt.
291 class CV_EXPORTS CudaMem
//! host allocation strategy; SHARED maps the memory into the device address space
294 enum AllocType { PAGE_LOCKED = 1, SHARED = 2, WRITE_COMBINED = 4 };
296 explicit CudaMem(AllocType alloc_type = PAGE_LOCKED);
//! copy constructor (shallow copy; shares data and reference counter)
298 CudaMem(const CudaMem& m);
300 CudaMem(int rows, int cols, int type, AllocType alloc_type = PAGE_LOCKED);
301 CudaMem(Size size, int type, AllocType alloc_type = PAGE_LOCKED);
303 //! creates from host memory, copying the data
304 explicit CudaMem(InputArray arr, AllocType alloc_type = PAGE_LOCKED);
308 CudaMem& operator =(const CudaMem& m);
310 //! swaps with other smart pointer
311 void swap(CudaMem& b);
313 //! returns deep copy of the matrix, i.e. the data is copied
314 CudaMem clone() const;
316 //! allocates new matrix data unless the matrix already has specified size and type.
317 void create(int rows, int cols, int type);
318 void create(Size size, int type);
320 //! creates alternative CudaMem header for the same data, with different
321 //! number of channels and/or different number of rows
322 CudaMem reshape(int cn, int rows = 0) const;
324 //! decrements reference counter and releases memory if needed.
327 //! returns matrix header with disabled reference counting for CudaMem data.
328 Mat createMatHeader() const;
330 //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.
331 GpuMat createGpuMatHeader() const;
333 // Please see cv::Mat for descriptions
334 bool isContinuous() const;
335 size_t elemSize() const;
336 size_t elemSize1() const;
339 int channels() const;
340 size_t step1() const;
344 // Please see cv::Mat for descriptions
352 const uchar* datastart;
353 const uchar* dataend;
//! allocation strategy chosen at construction time
355 AllocType alloc_type;
358 //! page-locks the matrix m memory and maps it for the device(s)
359 CV_EXPORTS void registerPageLocked(Mat& m);
361 //! unmaps the memory of matrix m, and makes it pageable again
362 CV_EXPORTS void unregisterPageLocked(Mat& m);
364 ///////////////////////////////// Stream //////////////////////////////////
366 // Encapsulates CUDA Stream. Provides interface for async copying.
367 // Passed to each function that supports async kernel execution.
368 // Reference counting is enabled.
370 class CV_EXPORTS Stream
// safe-bool idiom: bool_type/this_type_does_not_support_comparisons allow
// "if (stream)" tests without permitting accidental comparisons between streams
372 typedef void (Stream::*bool_type)() const;
373 void this_type_does_not_support_comparisons() const {}
//! host callback signature; `status` is the stream's completion status code
376 typedef void (*StreamCallback)(int status, void* userData);
378 //! creates a new asynchronous stream
381 //! queries an asynchronous stream for completion status
382 bool queryIfComplete() const;
384 //! waits for stream tasks to complete
385 void waitForCompletion();
387 //! makes a compute stream wait on an event
388 void waitEvent(const Event& event);
390 //! adds a callback to be called on the host after all currently enqueued items in the stream have completed
391 void enqueueHostCallback(StreamCallback callback, void* userData);
393 //! return Stream object for default CUDA stream
394 static Stream& Null();
396 //! returns true if stream object is not default (!= 0)
397 operator bool_type() const;
//! wraps an existing implementation object (used internally)
403 Stream(const Ptr<Impl>& impl);
405 friend struct StreamAccessor;
406 friend class BufferPool;
// Encapsulates a CUDA event for timing and cross-stream synchronization.
409 class CV_EXPORTS Event
414 DEFAULT = 0x00, /**< Default event flag */
415 BLOCKING_SYNC = 0x01, /**< Event uses blocking synchronization */
416 DISABLE_TIMING = 0x02, /**< Event will not record timing data */
417 INTERPROCESS = 0x04 /**< Event is suitable for interprocess use. DisableTiming must be set */
//! creates an event with the given creation flags
420 explicit Event(CreateFlags flags = DEFAULT);
//! records an event in the given stream (default CUDA stream when omitted)
423 void record(Stream& stream = Stream::Null());
425 //! queries an event's status
426 bool queryIfComplete() const;
428 //! waits for an event to complete
429 void waitForCompletion();
431 //! computes the elapsed time between events
432 static float elapsedTime(const Event& start, const Event& end);
439 friend struct EventAccessor;
442 //////////////////////////////// Initialization & Info ////////////////////////
444 //! this is the only function that does not throw exceptions if the library is compiled without CUDA
445 CV_EXPORTS int getCudaEnabledDeviceCount();
447 //! set device to be used for GPU executions for the calling host thread
448 CV_EXPORTS void setDevice(int device);
450 //! returns which device is currently being used for the calling host thread
451 CV_EXPORTS int getDevice();
453 //! explicitly destroys and cleans up all resources associated with the current device in the current process
454 //! any subsequent API call to this device will reinitialize the device
455 CV_EXPORTS void resetDevice();
// Enumerators of the FeatureSet enum: CUDA compute capabilities and the
// named features they introduced (values equal major*10 + minor).
// NOTE(review): the `enum FeatureSet` opening line is not visible in this excerpt.
459 FEATURE_SET_COMPUTE_10 = 10,
460 FEATURE_SET_COMPUTE_11 = 11,
461 FEATURE_SET_COMPUTE_12 = 12,
462 FEATURE_SET_COMPUTE_13 = 13,
463 FEATURE_SET_COMPUTE_20 = 20,
464 FEATURE_SET_COMPUTE_21 = 21,
465 FEATURE_SET_COMPUTE_30 = 30,
466 FEATURE_SET_COMPUTE_35 = 35,
// aliases naming the feature each compute capability introduced
468 GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
469 SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
470 NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
471 WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30,
472 DYNAMIC_PARALLELISM = FEATURE_SET_COMPUTE_35
475 //! checks whether current device supports the given feature
476 CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
478 //! information about what GPU archs this OpenCV CUDA module was compiled for
479 class CV_EXPORTS TargetArchs
//! true if the module was built with support for the given feature set
482 static bool builtWith(FeatureSet feature_set);
//! queries for the given compute capability (PTX or binary code)
484 static bool has(int major, int minor);
485 static bool hasPtx(int major, int minor);
486 static bool hasBin(int major, int minor);
//! queries relative to the given compute capability
488 static bool hasEqualOrLessPtx(int major, int minor);
489 static bool hasEqualOrGreater(int major, int minor);
490 static bool hasEqualOrGreaterPtx(int major, int minor);
491 static bool hasEqualOrGreaterBin(int major, int minor);
494 //! information about the given GPU.
// NOTE(review): constructor bodies/braces and the default-constructor
// declaration are not visible in this excerpt.
495 class CV_EXPORTS DeviceInfo
498 //! creates DeviceInfo object for the current GPU
501 //! creates DeviceInfo object for the given GPU
502 DeviceInfo(int device_id);
//! system index of the queried device
505 int deviceID() const;
507 //! ASCII string identifying device
508 const char* name() const;
510 //! global memory available on device in bytes
511 size_t totalGlobalMem() const;
513 //! shared memory available per block in bytes
514 size_t sharedMemPerBlock() const;
516 //! 32-bit registers available per block
517 int regsPerBlock() const;
519 //! warp size in threads
520 int warpSize() const;
522 //! maximum pitch in bytes allowed by memory copies
523 size_t memPitch() const;
525 //! maximum number of threads per block
526 int maxThreadsPerBlock() const;
528 //! maximum size of each dimension of a block
529 Vec3i maxThreadsDim() const;
531 //! maximum size of each dimension of a grid
532 Vec3i maxGridSize() const;
534 //! clock frequency in kilohertz
535 int clockRate() const;
537 //! constant memory available on device in bytes
538 size_t totalConstMem() const;
540 //! major compute capability
541 int majorVersion() const;
543 //! minor compute capability
544 int minorVersion() const;
546 //! alignment requirement for textures
547 size_t textureAlignment() const;
549 //! pitch alignment requirement for texture references bound to pitched memory
550 size_t texturePitchAlignment() const;
552 //! number of multiprocessors on device
553 int multiProcessorCount() const;
555 //! specifies whether there is a run time limit on kernels
556 bool kernelExecTimeoutEnabled() const;
558 //! device is integrated as opposed to discrete
559 bool integrated() const;
561 //! device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer
562 bool canMapHostMemory() const;
// Enumerators of the ComputeMode enum (declaration line not visible here).
566 ComputeModeDefault, /**< default compute mode (Multiple threads can use ::cudaSetDevice() with this device) */
567 ComputeModeExclusive, /**< compute-exclusive-thread mode (Only one thread in one process will be able to use ::cudaSetDevice() with this device) */
568 ComputeModeProhibited, /**< compute-prohibited mode (No threads can use ::cudaSetDevice() with this device) */
569 ComputeModeExclusiveProcess /**< compute-exclusive-process mode (Many threads in one process will be able to use ::cudaSetDevice() with this device) */
//! compute mode the device is currently in
573 ComputeMode computeMode() const;
575 //! maximum 1D texture size
576 int maxTexture1D() const;
578 //! maximum 1D mipmapped texture size
579 int maxTexture1DMipmap() const;
581 //! maximum size for 1D textures bound to linear memory
582 int maxTexture1DLinear() const;
584 //! maximum 2D texture dimensions
585 Vec2i maxTexture2D() const;
587 //! maximum 2D mipmapped texture dimensions
588 Vec2i maxTexture2DMipmap() const;
590 //! maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory
591 Vec3i maxTexture2DLinear() const;
593 //! maximum 2D texture dimensions if texture gather operations have to be performed
594 Vec2i maxTexture2DGather() const;
596 //! maximum 3D texture dimensions
597 Vec3i maxTexture3D() const;
599 //! maximum Cubemap texture dimensions
600 int maxTextureCubemap() const;
602 //! maximum 1D layered texture dimensions
603 Vec2i maxTexture1DLayered() const;
605 //! maximum 2D layered texture dimensions
606 Vec3i maxTexture2DLayered() const;
608 //! maximum Cubemap layered texture dimensions
609 Vec2i maxTextureCubemapLayered() const;
611 //! maximum 1D surface size
612 int maxSurface1D() const;
614 //! maximum 2D surface dimensions
615 Vec2i maxSurface2D() const;
617 //! maximum 3D surface dimensions
618 Vec3i maxSurface3D() const;
620 //! maximum 1D layered surface dimensions
621 Vec2i maxSurface1DLayered() const;
623 //! maximum 2D layered surface dimensions
624 Vec3i maxSurface2DLayered() const;
626 //! maximum Cubemap surface dimensions
627 int maxSurfaceCubemap() const;
629 //! maximum Cubemap layered surface dimensions
630 Vec2i maxSurfaceCubemapLayered() const;
632 //! alignment requirements for surfaces
633 size_t surfaceAlignment() const;
635 //! device can possibly execute multiple kernels concurrently
636 bool concurrentKernels() const;
638 //! device has ECC support enabled
639 bool ECCEnabled() const;
641 //! PCI bus ID of the device
642 int pciBusID() const;
644 //! PCI device ID of the device
645 int pciDeviceID() const;
647 //! PCI domain ID of the device
648 int pciDomainID() const;
650 //! true if device is a Tesla device using TCC driver, false otherwise
651 bool tccDriver() const;
653 //! number of asynchronous engines
654 int asyncEngineCount() const;
656 //! device shares a unified address space with the host
657 bool unifiedAddressing() const;
659 //! peak memory clock frequency in kilohertz
660 int memoryClockRate() const;
662 //! global memory bus width in bits
663 int memoryBusWidth() const;
665 //! size of L2 cache in bytes
666 int l2CacheSize() const;
668 //! maximum resident threads per multiprocessor
669 int maxThreadsPerMultiProcessor() const;
671 //! gets free and total device memory
672 void queryMemory(size_t& totalMemory, size_t& freeMemory) const;
673 size_t freeMemory() const;
674 size_t totalMemory() const;
676 //! checks whether device supports the given feature
677 bool supports(FeatureSet feature_set) const;
679 //! checks whether the CUDA module can be run on the given device
680 bool isCompatible() const;
//! prints detailed information about the given device to stdout
686 CV_EXPORTS void printCudaDeviceInfo(int device);
//! prints a short summary about the given device to stdout
687 CV_EXPORTS void printShortCudaDeviceInfo(int device);
689 }} // namespace cv { namespace cuda {
692 #include "opencv2/core/cuda.inl.hpp"
694 #endif /* __OPENCV_CORE_CUDA_HPP__ */