1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
5 #ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP
6 #define OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP
#include <opencv2/core.hpp>

#include <cuda_runtime_api.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <sstream>
#include <type_traits>
#include <utility>
20 namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
22 /* @brief smart device pointer with allocation/deallocation methods
24 * ManagedPtr is a smart shared device pointer which also handles memory allocation.
28 static_assert(!std::is_const<T>::value && !std::is_volatile<T>::value, "T cannot be cv-qualified");
29 static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");
32 using element_type = T;
34 using pointer = DevicePtr<element_type>;
35 using const_pointer = DevicePtr<typename std::add_const<element_type>::type>;
37 using size_type = std::size_t;
39 ManagedPtr() noexcept : wrapped{ nullptr }, n{ 0 }, capacity{ 0 } { }
40 ManagedPtr(const ManagedPtr&) noexcept = default;
41 ManagedPtr(ManagedPtr&& other) noexcept
42 : wrapped{ std::move(other.wrapped) }, n{ other.n }, capacity { other.capacity }
47 /** allocates device memory for \p count number of element */
48 ManagedPtr(size_type count) {
50 CV_Error(Error::StsBadArg, "number of elements is zero or negative");
54 CUDA4DNN_CHECK_CUDA(cudaMalloc(&temp, count * sizeof(element_type)));
56 auto ptr = typename pointer::pointer(static_cast<element_type*>(temp));
57 wrapped.reset(ptr, [](element_type* ptr) {
59 /* contract violation for std::shared_ptr if cudaFree throws */
61 CUDA4DNN_CHECK_CUDA(cudaFree(ptr));
62 } catch (const CUDAException& ex) {
63 std::ostringstream os;
64 os << "Device memory deallocation failed in deleter.\n";
66 os << "Exception will be ignored.\n";
67 CV_LOG_WARNING(0, os.str().c_str());
71 /* std::shared_ptr<T>::reset invokves the deleter if an exception occurs; hence, we don't
72 * need to have a try-catch block to free the allocated device memory
78 ManagedPtr& operator=(ManagedPtr&& other) noexcept {
79 wrapped = std::move(other.wrapped);
81 capacity = other.capacity;
87 size_type size() const noexcept { return n; }
89 void reset() noexcept { wrapped.reset(); n = capacity = 0; }
92 * deallocates any previously allocated memory and allocates device memory
93 * for \p count number of elements
95 * @note no reallocation if the previously allocated memory has no owners and the requested memory size fits in it
96 * @note use move constructor to guarantee a deallocation of the previously allocated memory
98 * Exception Guarantee: Strong
100 void reset(size_type count) {
101 /* we need to fully own the memory to perform optimizations */
102 if (wrapped.use_count() == 1) {
103 /* avoid reallocation if the existing capacity is sufficient */
104 if (count <= capacity) {
110 /* no optimization performed; allocate memory */
111 ManagedPtr tmp(count);
115 pointer get() const noexcept { return pointer(wrapped.get()); }
117 explicit operator bool() const noexcept { return wrapped; }
119 friend bool operator==(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped == rhs.wrapped; }
120 friend bool operator!=(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped != rhs.wrapped; }
122 friend void swap(ManagedPtr& lhs, ManagedPtr& rhs) noexcept {
124 swap(lhs.wrapped, rhs.wrapped);
126 swap(lhs.capacity, rhs.capacity);
130 std::shared_ptr<element_type> wrapped;
131 size_type n, capacity;
134 /** copies entire memory block pointed by \p src to \p dest
136 * \param[in] src device pointer
137 * \param[out] dest host pointer
140 * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
142 * Exception Guarantee: Basic
145 void memcpy(T *dest, const ManagedPtr<T>& src) {
146 memcpy<T>(dest, src.get(), src.size());
149 /** copies data from memory pointed by \p src to fully fill \p dest
151 * \param[in] src host pointer
152 * \param[out] dest device pointer
155 * - memory pointed by \p src must be at least as big as the memory block held by \p dest
157 * Exception Guarantee: Basic
160 void memcpy(const ManagedPtr<T>& dest, const T* src) {
161 memcpy<T>(dest.get(), src, dest.size());
164 /** copies data from memory pointed by \p src to \p dest
166 * if the two \p src and \p dest have different sizes, the number of elements copied is
167 * equal to the size of the smaller memory block
169 * \param[in] src device pointer
170 * \param[out] dest device pointer
172 * Exception Guarantee: Basic
175 void memcpy(const ManagedPtr<T>& dest, const ManagedPtr<T>& src) {
176 memcpy<T>(dest.get(), src.get(), std::min(dest.size(), src.size()));
179 /** sets device memory block to a specific 8-bit value
181 * \param[in] src device pointer
182 * \param[out] ch 8-bit value to fill the device memory with
184 * Exception Guarantee: Basic
187 void memset(const ManagedPtr<T>& dest, std::int8_t ch) {
188 memset<T>(dest.get(), ch, dest.size());
191 /** copies entire memory block pointed by \p src to \p dest asynchronously
193 * \param[in] src device pointer
194 * \param[out] dest host pointer
195 * \param stream CUDA stream that has to be used for the memory transfer
198 * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
199 * - \p dest points to page-locked memory
201 * Exception Guarantee: Basic
204 void memcpy(T *dest, const ManagedPtr<T>& src, const Stream& stream) {
206 memcpy<T>(dest, src.get(), src.size(), stream);
209 /** copies data from memory pointed by \p src to \p dest asynchronously
211 * \param[in] src host pointer
212 * \param[out] dest device pointer
213 * \param stream CUDA stream that has to be used for the memory transfer
216 * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
217 * - \p src points to page-locked memory
219 * Exception Guarantee: Basic
222 void memcpy(const ManagedPtr<T>& dest, const T* src, const Stream& stream) {
224 memcpy<T>(dest.get(), src, dest.size(), stream);
227 /** copies data from memory pointed by \p src to \p dest asynchronously
229 * \param[in] src device pointer
230 * \param[out] dest device pointer
231 * \param stream CUDA stream that has to be used for the memory transfer
233 * if the two \p src and \p dest have different sizes, the number of elements copied is
234 * equal to the size of the smaller memory block
236 * Exception Guarantee: Basic
239 void memcpy(ManagedPtr<T>& dest, const ManagedPtr<T>& src, const Stream& stream) {
241 memcpy<T>(dest.get(), src.get(), std::min(dest.size(), src.size()), stream);
244 /** sets device memory block to a specific 8-bit value asynchronously
246 * \param[in] src device pointer
247 * \param[out] ch 8-bit value to fill the device memory with
248 * \param stream CUDA stream that has to be used for the memory operation
250 * Exception Guarantee: Basic
253 void memset(const ManagedPtr<T>& dest, int ch, const Stream& stream) {
255 memset<T>(dest.get(), ch, dest.size(), stream);
258 /** @brief registers host memory as page-locked and unregisters on destruction */
259 class MemoryLockGuard {
261 MemoryLockGuard() noexcept : ptr { nullptr } { }
262 MemoryLockGuard(const MemoryLockGuard&) = delete;
263 MemoryLockGuard(MemoryLockGuard&& other) noexcept : ptr{ other.ptr } {
267 /** page-locks \p size_in_bytes bytes of memory starting from \p ptr_
270 * - host memory should be unregistered
272 MemoryLockGuard(void* ptr_, std::size_t size_in_bytes) {
273 CUDA4DNN_CHECK_CUDA(cudaHostRegister(ptr_, size_in_bytes, cudaHostRegisterPortable));
277 MemoryLockGuard& operator=(const MemoryLockGuard&) = delete;
278 MemoryLockGuard& operator=(MemoryLockGuard&& other) noexcept {
286 CUDA4DNN_CHECK_CUDA(cudaHostUnregister(ptr));
293 }}}} /* namespace cv::dnn::cuda4dnn::csl */
295 #endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP */