modules/dnn/src/cuda4dnn/csl/memory.hpp

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 #ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP
   6 #define OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP
   7
#include "error.hpp"
#include "pointer.hpp"

#include <opencv2/core.hpp>

#include <cuda_runtime_api.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <sstream>
#include <type_traits>
#include <utility>
  19
  20 namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
  21
  22     /* @brief smart device pointer with allocation/deallocation methods
  23      *
  24      * ManagedPtr is a smart shared device pointer which also handles memory allocation.
  25      */
  26     template <class T>
  27     class ManagedPtr {
  28         static_assert(!std::is_const<T>::value && !std::is_volatile<T>::value, "T cannot be cv-qualified");
  29         static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");
  30
  31     public:
  32         using element_type = T;
  33
  34         using pointer = DevicePtr<element_type>;
  35         using const_pointer = DevicePtr<typename std::add_const<element_type>::type>;
  36
  37         using size_type = std::size_t;
  38
  39         ManagedPtr() noexcept : wrapped{ nullptr }, n{ 0 }, capacity{ 0 } { }
  40         ManagedPtr(const ManagedPtr&) noexcept = default;
  41         ManagedPtr(ManagedPtr&& other) noexcept
  42             : wrapped{ std::move(other.wrapped) }, n{ other.n }, capacity { other.capacity }
  43         {
  44             other.reset();
  45         }
  46
  47         /** allocates device memory for \p count number of element */
  48         ManagedPtr(size_type count) {
  49             if (count <= 0) {
  50                 CV_Error(Error::StsBadArg, "number of elements is zero or negative");
  51             }
  52
  53             void* temp = nullptr;
  54             CUDA4DNN_CHECK_CUDA(cudaMalloc(&temp, count * sizeof(element_type)));
  55
  56             auto ptr = typename pointer::pointer(static_cast<element_type*>(temp));
  57             wrapped.reset(ptr, [](element_type* ptr) {
  58                 if (ptr != nullptr) {
  59                     /* contract violation for std::shared_ptr if cudaFree throws */
  60                     try {
  61                         CUDA4DNN_CHECK_CUDA(cudaFree(ptr));
  62                     } catch (const CUDAException& ex) {
  63                         std::ostringstream os;
  64                         os << "Device memory deallocation failed in deleter.\n";
  65                         os << ex.what();
  66                         os << "Exception will be ignored.\n";
  67                         CV_LOG_WARNING(0, os.str().c_str());
  68                     }
  69                 }
  70             });
  71             /* std::shared_ptr<T>::reset invokves the deleter if an exception occurs; hence, we don't
  72              * need to have a try-catch block to free the allocated device memory
  73              */
  74
  75             n = capacity = count;
  76         }
  77
  78         ManagedPtr& operator=(ManagedPtr&& other) noexcept {
  79             wrapped = std::move(other.wrapped);
  80             n = other.n;
  81             capacity = other.capacity;
  82
  83             other.reset();
  84             return *this;
  85         }
  86
  87         size_type size() const noexcept { return n; }
  88
  89         void reset() noexcept { wrapped.reset(); n = capacity = 0; }
  90
  91         /**
  92          * deallocates any previously allocated memory and allocates device memory
  93          * for \p count number of elements
  94          *
  95          * @note no reallocation if the previously allocated memory has no owners and the requested memory size fits in it
  96          * @note use move constructor to guarantee a deallocation of the previously allocated memory
  97          *
  98          * Exception Guarantee: Strong
  99          */
 100         void reset(size_type count) {
 101             /* we need to fully own the memory to perform optimizations */
 102             if (wrapped.use_count() == 1) {
 103                 /* avoid reallocation if the existing capacity is sufficient */
 104                 if (count <= capacity) {
 105                     n = count;
 106                     return;
 107                 }
 108             }
 109
 110             /* no optimization performed; allocate memory */
 111             ManagedPtr tmp(count);
 112             swap(tmp, *this);
 113         }
 114
 115         pointer get() const noexcept { return pointer(wrapped.get()); }
 116
 117         explicit operator bool() const noexcept { return wrapped; }
 118
 119         friend bool operator==(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped == rhs.wrapped; }
 120         friend bool operator!=(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped != rhs.wrapped; }
 121
 122         friend void swap(ManagedPtr& lhs, ManagedPtr& rhs) noexcept {
 123             using std::swap;
 124             swap(lhs.wrapped, rhs.wrapped);
 125             swap(lhs.n, rhs.n);
 126             swap(lhs.capacity, rhs.capacity);
 127         }
 128
 129     private:
 130         std::shared_ptr<element_type> wrapped;
 131         size_type n, capacity;
 132     };
 133
 134     /** copies entire memory block pointed by \p src to \p dest
 135      *
 136      * \param[in]   src     device pointer
 137      * \param[out]  dest    host pointer
 138      *
 139      * Pre-conditions:
 140      * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
 141      *
 142      * Exception Guarantee: Basic
 143      */
 144     template <class T>
 145     void memcpy(T *dest, const ManagedPtr<T>& src) {
 146         memcpy<T>(dest, src.get(), src.size());
 147     }
 148
 149     /** copies data from memory pointed by \p src to fully fill \p dest
 150      *
 151      * \param[in]   src     host pointer
 152      * \param[out]  dest    device pointer
 153      *
 154      * Pre-conditions:
 155      * - memory pointed by \p src must be at least as big as the memory block held by \p dest
 156      *
 157      * Exception Guarantee: Basic
 158      */
 159     template <class T>
 160     void memcpy(const ManagedPtr<T>& dest, const T* src) {
 161         memcpy<T>(dest.get(), src, dest.size());
 162     }
 163
 164     /** copies data from memory pointed by \p src to \p dest
 165      *
 166      * if the two \p src and \p  dest have different sizes, the number of elements copied is
 167      * equal to the size of the smaller memory block
 168      *
 169      * \param[in]   src     device pointer
 170      * \param[out]  dest    device pointer
 171      *
 172      * Exception Guarantee: Basic
 173      */
 174     template <class T>
 175     void memcpy(const ManagedPtr<T>& dest, const ManagedPtr<T>& src) {
 176         memcpy<T>(dest.get(), src.get(), std::min(dest.size(), src.size()));
 177     }
 178
 179     /** sets device memory block to a specific 8-bit value
 180      *
 181      * \param[in]   src     device pointer
 182      * \param[out]  ch      8-bit value to fill the device memory with
 183      *
 184      * Exception Guarantee: Basic
 185      */
 186     template <class T>
 187     void memset(const ManagedPtr<T>& dest, std::int8_t ch) {
 188         memset<T>(dest.get(), ch, dest.size());
 189     }
 190
 191     /** copies entire memory block pointed by \p src to \p dest asynchronously
 192      *
 193      * \param[in]   src     device pointer
 194      * \param[out]  dest    host pointer
 195      * \param       stream  CUDA stream that has to be used for the memory transfer
 196      *
 197      * Pre-conditions:
 198      * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
 199      * - \p dest points to page-locked memory
 200      *
 201      * Exception Guarantee: Basic
 202      */
 203     template <class T>
 204     void memcpy(T *dest, const ManagedPtr<T>& src, const Stream& stream) {
 205         CV_Assert(stream);
 206         memcpy<T>(dest, src.get(), src.size(), stream);
 207     }
 208
 209     /** copies data from memory pointed by \p src to \p dest asynchronously
 210      *
 211      * \param[in]   src     host pointer
 212      * \param[out]  dest    device pointer
 213      * \param       stream  CUDA stream that has to be used for the memory transfer
 214      *
 215      * Pre-conditions:
 216      * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
 217      * - \p src points to page-locked memory
 218      *
 219      * Exception Guarantee: Basic
 220      */
 221     template <class T>
 222     void memcpy(const ManagedPtr<T>& dest, const T* src, const Stream& stream) {
 223         CV_Assert(stream);
 224         memcpy<T>(dest.get(), src, dest.size(), stream);
 225     }
 226
 227     /** copies data from memory pointed by \p src to \p dest asynchronously
 228      *
 229      * \param[in]   src     device pointer
 230      * \param[out]  dest    device pointer
 231      * \param       stream  CUDA stream that has to be used for the memory transfer
 232      *
 233      * if the two \p src and \p  dest have different sizes, the number of elements copied is
 234      * equal to the size of the smaller memory block
 235      *
 236      * Exception Guarantee: Basic
 237      */
 238     template <class T>
 239     void memcpy(ManagedPtr<T>& dest, const ManagedPtr<T>& src, const Stream& stream) {
 240         CV_Assert(stream);
 241         memcpy<T>(dest.get(), src.get(), std::min(dest.size(), src.size()), stream);
 242     }
 243
 244     /** sets device memory block to a specific 8-bit value asynchronously
 245      *
 246      * \param[in]   src     device pointer
 247      * \param[out]  ch      8-bit value to fill the device memory with
 248      * \param       stream  CUDA stream that has to be used for the memory operation
 249      *
 250      * Exception Guarantee: Basic
 251      */
 252     template <class T>
 253     void memset(const ManagedPtr<T>& dest, int ch, const Stream& stream) {
 254         CV_Assert(stream);
 255         memset<T>(dest.get(), ch, dest.size(), stream);
 256     }
 257
 258     /** @brief registers host memory as page-locked and unregisters on destruction */
 259     class MemoryLockGuard {
 260     public:
 261         MemoryLockGuard() noexcept : ptr { nullptr } { }
 262         MemoryLockGuard(const MemoryLockGuard&) = delete;
 263         MemoryLockGuard(MemoryLockGuard&& other) noexcept : ptr{ other.ptr } {
 264             other.ptr = nullptr;
 265         }
 266
 267         /** page-locks \p size_in_bytes bytes of memory starting from \p ptr_
 268          *
 269          * Pre-conditons:
 270          * - host memory should be unregistered
 271          */
 272         MemoryLockGuard(void* ptr_, std::size_t size_in_bytes) {
 273             CUDA4DNN_CHECK_CUDA(cudaHostRegister(ptr_, size_in_bytes, cudaHostRegisterPortable));
 274             ptr = ptr_;
 275         }
 276
 277         MemoryLockGuard& operator=(const MemoryLockGuard&) = delete;
 278         MemoryLockGuard& operator=(MemoryLockGuard&& other) noexcept {
 279             ptr = other.ptr;
 280             other.ptr = nullptr;
 281             return *this;
 282         }
 283
 284         ~MemoryLockGuard() {
 285             if(ptr != nullptr)
 286                 CUDA4DNN_CHECK_CUDA(cudaHostUnregister(ptr));
 287         }
 288
 289     private:
 290         void *ptr;
 291     };
 292
 293 }}}} /* namespace cv::dnn::cuda4dnn::csl */
 294
 295 #endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP */