modules/dnn/src/cuda4dnn/csl/workspace.hpp

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 #ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP
   6 #define OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP
   7
#include "pointer.hpp"
#include "span.hpp"
#include "tensor.hpp"

#include <cstddef>
#include <cstdint>
#include <functional>
#include <iterator>
#include <numeric>
  15
  16 namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
  17
  18     /** @brief maintains a single block of reusable device memory
  19      *
  20      * Each Workspace object is intended to be used by a single entity at a time but by
  21      * different entities at different times. It maintains a single reusable block of memory which
  22      * is sufficient for the largest consumer.
  23      */
  24     class Workspace {
  25     public:
  26
  27         /** @brief reserve \p bytes of memory */
  28         void require(std::size_t bytes) {
  29             if (bytes > ptr.size())
  30                 ptr.reset(bytes);
  31         }
  32
  33         /** @brief number of bytes reserved by the largest consumer */
  34         std::size_t size() const noexcept {
  35             return ptr.size();
  36         }
  37
  38         /** @brief returns the pointer to the workspace memory */
  39         DevicePtr<unsigned char> get() {
  40             return ptr.get();
  41         }
  42
  43     private:
  44         ManagedPtr<unsigned char> ptr;
  45     };
  46
  47     /** used to compute total workspace size from several workspace requests */
  48     class WorkspaceBuilder {
  49     public:
  50         WorkspaceBuilder() noexcept : max_size_in_bytes{ 0 } { }
  51
  52         /** request memory for \p count number of elements of the type \tparam T */
  53         template <class T = std::int8_t>
  54         void require(std::size_t count) noexcept {
  55             auto blocks256 = (count * sizeof(T) + 255) / 256;
  56             max_size_in_bytes += blocks256 * 256;
  57         }
  58
  59         /** returns the total workspace memory that is required */
  60         std::size_t required_workspace_size() const noexcept { return max_size_in_bytes; }
  61
  62     private:
  63         std::size_t max_size_in_bytes;
  64     };
  65
  66     /** general memory block from a workspace which can be passed on to the requester */
  67     class WorkspaceInstance {
  68     public:
  69
  70         /** returns a device pointer to the workspace memory */
  71         template <class T = void>
  72         DevicePtr<T> get() const noexcept {
  73             return static_cast<DevicePtr<T>>(ptr);
  74         }
  75
  76         /** returnss the size of the workspace memory in bytes */
  77         std::size_t size_in_bytes() const noexcept {
  78             return size_in_bytes_;
  79         }
  80
  81         /** creates a Span<T> of \p count elements from the workspace memory */
  82         template <class T>
  83         Span<T> get_span(std::size_t count = 0) const {
  84             if (count == 0)
  85                 count = size_in_bytes_ / sizeof(T);
  86
  87             if (count * sizeof(T) > size_in_bytes_)
  88                 CV_Error(Error::StsNoMem, "memory not sufficient");
  89
  90             return Span<T>(static_cast<DevicePtr<T>>(ptr), count);
  91         }
  92
  93         /** creates a TensorSpan<T> of the given shape from the workspace memory */
  94         template <class T, class ForwardItr>
  95         TensorSpan<T> get_tensor_span(ForwardItr shape_begin, ForwardItr shape_end) const {
  96             using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
  97             auto required_size = std::accumulate(shape_begin, shape_end, 1, std::multiplies<ItrValueType>());
  98             if (required_size * sizeof(T) > size_in_bytes_)
  99                 CV_Error(Error::StsNoMem, "memory not sufficient");
 100             return TensorSpan<T>(static_cast<DevicePtr<T>>(ptr), shape_begin, shape_end);
 101         }
 102
 103     private:
 104         DevicePtr<void> ptr;
 105         std::size_t size_in_bytes_;
 106
 107         friend class WorkspaceAllocator;
 108         WorkspaceInstance(DevicePtr<void> ptr_, std::size_t size_in_bytes__)
 109             : ptr{ ptr_ }, size_in_bytes_{ size_in_bytes__ } { }
 110     };
 111
 112     /** used to split a single workspace into constituents */
 113     class WorkspaceAllocator {
 114     public:
 115         WorkspaceAllocator() = default;
 116         WorkspaceAllocator(Workspace& workspace) noexcept
 117             : current{ workspace.get() }, bytes_remaining { workspace.size() }
 118         {
 119             CV_Assert(is_aligned<void>(current, 256));
 120             CV_Assert(bytes_remaining % 256 == 0);
 121         }
 122
 123         /** allocates a Span<T> of \p count elements from the workspace memory */
 124         template <class T>
 125         Span<T> get_span(std::size_t count = 0) {
 126             return accquire<T>(count);
 127         }
 128
 129         /** allocates a TensorSpan<T> of the given shape from the workspace memory */
 130         template <class T, class ForwardItr>
 131         TensorSpan<T> get_tensor_span(ForwardItr start, ForwardItr end) {
 132             using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
 133             auto required_size = std::accumulate(start, end, 1, std::multiplies<ItrValueType>());
 134             return TensorSpan<T>(accquire<T>(required_size).data(), start, end);
 135         }
 136
 137         /** allocates a WorkspaceInstance of size \p bytes from the workspace memory */
 138         WorkspaceInstance get_instance(std::size_t bytes = 0) {
 139             auto span = accquire(bytes);
 140             return WorkspaceInstance(DevicePtr<void>(span.data()), span.size());
 141         }
 142
 143     private:
 144         template <class T = std::int8_t>
 145         Span<T> accquire(std::size_t count = 0) {
 146             auto ptr = current;
 147
 148             if (count == 0)
 149                 count = bytes_remaining / sizeof(T);
 150
 151             auto blocks256 = (count * sizeof(T) + 255) / 256;
 152             if (bytes_remaining < blocks256 * 256)
 153                 CV_Error(Error::StsNoMem, "out of workspace memory");
 154
 155             bytes_remaining -= blocks256 * 256;
 156             current = static_cast<DevicePtr<std::int8_t>>(current) + blocks256 * 256;
 157             return Span<T>(static_cast<DevicePtr<T>>(ptr), count);
 158         }
 159
 160         DevicePtr<void> current;
 161         std::size_t bytes_remaining;
 162     };
 163
 164 }}}} /* namespace cv::dnn::cuda4dnn::csl */
 165
 166 #endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP */