modules/dnn/src/op_cuda.hpp

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 #ifndef OPENCV_DNN_SRC_OP_CUDA_HPP
   6 #define OPENCV_DNN_SRC_OP_CUDA_HPP
   7
   8 #ifdef HAVE_CUDA
   9 #include "cuda4dnn/csl/stream.hpp"
  10 #include "cuda4dnn/csl/cublas.hpp"
  11 #include "cuda4dnn/csl/cudnn.hpp"
  12 #include "cuda4dnn/csl/tensor.hpp"
  13 #include "cuda4dnn/csl/memory.hpp"
  14 #include "cuda4dnn/csl/fp16.hpp"
  15 #include "cuda4dnn/csl/workspace.hpp"
  16 #endif
  17
  18 #include <opencv2/dnn/shape_utils.hpp>
  19 #include <opencv2/core.hpp>
  20
  21 #include <cstddef>
  22 #include <memory>
  23 #include <iterator>
  24
  25 namespace cv { namespace dnn {
  26
  27     constexpr bool IS_DNN_CUDA_TARGET(int id) {
  28         return id == DNN_TARGET_CUDA_FP16 || id == DNN_TARGET_CUDA;
  29     }
  30
  31     constexpr bool haveCUDA() {
  32 #ifdef HAVE_CUDA
  33         return true;
  34 #else
  35         return false;
  36 #endif
  37     }
  38
  39 #ifdef HAVE_CUDA
  40     namespace cuda4dnn { namespace csl {
  41         struct CSLContext {
  42             Stream stream;
  43             cublas::Handle cublas_handle;
  44             cudnn::Handle cudnn_handle;
  45         };
  46
  47         /** @brief creates Tensor object from cv::Mat (only the header is created, i.e. no data is copied)
  48          *
  49          * \tparam      T   element type for the tensor
  50          * \param[in]   mat cv::Mat from which the shape must be inferred
  51          *
  52          * \return a Tensor object with the shape of \p mat
  53          */
  54         template <class T>
  55         Tensor<T> makeTensorHeader(const Mat& mat) {
  56             auto sizes = shape(mat);
  57             return Tensor<T>(std::begin(sizes), std::end(sizes));
  58         }
  59
  60         /** @brief copies data from a cv::Mat to TensorType
  61          *
  62          * \tparam  T   the type of the elements contained in TensorType object
  63          *
  64          * \param[in]   srcMat      source matrix
  65          * \param[out]  destTensor  destination tensor
  66          * \param       stream      CUDA stream to use for the memory transfer
  67          *
  68          * The memory copy starts from begining \p srcMat. The number of elements copied is
  69          * equal to the number of elements in \p destTensor.
  70          *
  71          * Pre-conditions:
  72          * - \p srcMat must contain elements of type CV_32F
  73          * - the size of \p srcMat must be larger than or equal to the size of \p destTensor
  74          *
  75          * @note best performance when \p srcMat is continuous and page-locked
  76          * @note blocks calling thread if \p srcMat is not page-locked
  77          */
  78         template <class T>
  79         void copyMatToTensor(const Mat& srcMat, const TensorSpan<T> destTensor, const Stream& stream);
  80
  81         template <> inline
  82         void copyMatToTensor(const Mat& srcMat, const TensorSpan<half> destTensor, const Stream& stream) {
  83             /* should perhaps convert cv::Mat of different type to the required type and copy */
  84             CV_Assert(srcMat.type() == CV_32F);
  85             CV_Assert(srcMat.total() >= destTensor.size());
  86
  87             Mat temp;
  88             srcMat.convertTo(temp, CV_16F);
  89             CV_Assert(temp.isContinuous());
  90
  91             memcpy<half>(destTensor.get(), reinterpret_cast<half*>(temp.data), destTensor.size(), stream);
  92         }
  93
  94         template <> inline
  95         void copyMatToTensor(const Mat& srcMat, const TensorSpan<float> destTensor, const Stream& stream) {
  96             /* should perhaps convert cv::Mat of different type to the required type and copy */
  97             CV_Assert(srcMat.type() == CV_32F);
  98             CV_Assert(srcMat.total() >= destTensor.size());
  99
 100             Mat temp = srcMat.isContinuous() ? srcMat : srcMat.clone();
 101             CV_Assert(temp.isContinuous());
 102
 103             memcpy<float>(destTensor.get(), reinterpret_cast<float*>(temp.data), destTensor.size(), stream);
 104         }
 105
 106         /** @brief copies data from a TensorType to a cv::Mat
 107          *
 108          * \tparam  T   the type of the elements contained in TensorType object
 109          *
 110          * \param[in]   srcTensor   source tensor
 111          * \param[out]  destMat     destination matrix
 112          * \param       stream      CUDA stream to use for the memory transfer
 113          *
 114          * The entire memory block held by the \p srcTensor is copied to \p destMat.
 115          *
 116          * Pre-conditions:
 117          * - \p destMat must contain elements of type CV_32F
 118          * - the size of \p destMat must be larger than or equal to the size of \p srcTensor
 119          *
 120          * @note best performance when \p destMat is continuous and page-locked
 121          * @note blocks calling thread if \p destMat is not page-locked
 122          */
 123         template <class T>
 124         void copyTensorToMat(TensorView<T> srcTensor, Mat& destMat, const Stream& stream);
 125
 126         template <> inline
 127         void copyTensorToMat(TensorView<half> srcTensor, Mat& destMat, const Stream& stream) {
 128             CV_Assert(destMat.type() == CV_32F);
 129             CV_Assert(destMat.total() >= srcTensor.size());
 130
 131             Mat temp(shape(destMat), CV_16F);
 132             CV_Assert(temp.isContinuous());
 133
 134             memcpy<half>(reinterpret_cast<half*>(temp.data), srcTensor.get(), srcTensor.size(), stream);
 135
 136             temp.convertTo(destMat, CV_32F);
 137         }
 138
 139         template <> inline
 140         void copyTensorToMat(TensorView<float> srcTensor, Mat& destMat, const Stream& stream) {
 141             CV_Assert(destMat.type() == CV_32F);
 142             CV_Assert(destMat.total() >= srcTensor.size());
 143
 144             Mat temp = destMat.isContinuous() ? destMat : destMat.clone();
 145             CV_Assert(temp.isContinuous());
 146
 147             memcpy<float>(reinterpret_cast<float*>(temp.data), srcTensor.get(), srcTensor.size(), stream);
 148
 149             if (temp.data != destMat.data)
 150                 temp.copyTo(destMat);
 151         }
 152
 153     }} /* namespace cuda4dnn::csl */
 154
 155     /** base class for CUDA operation nodes (for all supported targets) */
 156     class CUDABackendNode : public BackendNode {
 157     public:
 158         CUDABackendNode() : BackendNode(DNN_BACKEND_CUDA) { }
 159         virtual ~CUDABackendNode() { }
 160
 161         virtual void forward(
 162             const std::vector<cv::Ptr<BackendWrapper>>& inputs,
 163             const std::vector<cv::Ptr<BackendWrapper>>& outputs,
 164             cuda4dnn::csl::Workspace& workspace) = 0;
 165
 166         virtual std::size_t get_workspace_memory_in_bytes() const noexcept { return 0; }
 167     };
 168
 169     /** @brief utility function which creates CUDA node of correct type from `targetId`
 170      *
 171      * CUDA operation nodes take the type of data they operate on as a template parameter.
 172      * For example, ConcatOp<float> is an operation node which concats tensors of `float` type
 173      * into a tensor of `float` type.
 174      *
 175      * This utility function aids the creation of nodes of different types and eliminates the
 176      * need for CUDA target constants (`DNN_TARGET_XXX`) to appear in the operation code which
 177      * reduces coupling between modules.
 178      *
 179      * Example:
 180      * template <class T>
 181      * class ConcatOp : public CUDABackendNode;
 182      *
 183      * // returns a cv::Ptr to a ConcatOp<half> object
 184      * auto node = make_cuda_node<ConcatOp>(DNN_TARGET_CUDA_FP16, axis);
 185      *
 186      * // returns a cv::Ptr to a ConcatOp<float> object
 187      * auto node = make_cuda_node<ConcatOp>(DNN_TARGET_CUDA, axis);
 188      */
 189     template <template <class> class NodeType, class ...Args>
 190     cv::Ptr<BackendNode> make_cuda_node(int targetId, Args&& ...args) {
 191         switch (targetId)
 192         {
 193         case DNN_TARGET_CUDA_FP16:
 194             return Ptr<BackendNode>(new NodeType<half>(std::forward<Args>(args)...));
 195         case DNN_TARGET_CUDA:
 196             return Ptr<BackendNode>(new NodeType<float>(std::forward<Args>(args)...));
 197         default:
 198             CV_Assert(IS_DNN_CUDA_TARGET(targetId));
 199         }
 200         return Ptr<BackendNode>();
 201     }
 202
 203     /* base class for all CUDA backend/target wrappers */
 204     class CUDABackendWrapper : public BackendWrapper {
 205     public:
 206         CUDABackendWrapper(int targetId) : BackendWrapper(DNN_BACKEND_CUDA, targetId) { }
 207         virtual ~CUDABackendWrapper() { }
 208
 209         void copyToHost() override = 0;
 210         void setHostDirty() override = 0;
 211
 212         virtual void copyToDevice() = 0;
 213         virtual void setDeviceDirty() = 0;
 214
 215         virtual MatShape getShape() const noexcept = 0;
 216         virtual std::size_t getRank() const noexcept = 0;
 217
 218         /** @note setting the stream updates the stream for all wrappers which use the same tensor */
 219         virtual void setStream(cuda4dnn::csl::Stream stream) noexcept = 0;
 220     };
 221
 222     template <class T, int TargetID>
 223     class GenericCUDABackendWrapper final : public CUDABackendWrapper {
 224     public:
 225         using value_type = T;
 226         using tensor_span_type = cuda4dnn::csl::TensorSpan<value_type>;
 227         using tensor_view_type = cuda4dnn::csl::TensorView<value_type>;
 228
 229         /* Pre-conditions:
 230          * - there must be no other instance of `GenericCUDABackendWrapper` which wraps the host memory used by `m`
 231          * - the host memory must remain allocated throughout the lifetime of this object
 232          *
 233          * Post-conditions:
 234          * - the host memory used by \p m "may" be page-locked
 235          */
 236         GenericCUDABackendWrapper(Mat& m)
 237             : CUDABackendWrapper(TargetID)
 238         {
 239             shape = cv::dnn::shape(m);
 240
 241             shared_block = std::make_shared<shared_block_type>();
 242             shared_block->host_dirty = true;
 243             shared_block->device_dirty = false;
 244
 245             shared_block->host = m;
 246
 247             try {
 248                 shared_block->memGuard = cuda4dnn::csl::MemoryLockGuard(m.data, m.total() * m.elemSize());
 249             } catch (...) {
 250                 /* a common reason for failure is that the host system (for example, a Jetson device) does not support it */
 251                 /* we ignore the failure as this is just an optimization and not a requirement */
 252             }
 253
 254             shared_block->device = cuda4dnn::csl::ManagedPtr<T>(m.total());
 255         }
 256
 257         GenericCUDABackendWrapper(const Ptr<BackendWrapper>& base_, const MatShape& shape_)
 258             : CUDABackendWrapper(TargetID)
 259         {
 260             const Ptr<GenericCUDABackendWrapper> base = base_.dynamicCast<GenericCUDABackendWrapper>();
 261             CV_Assert(base);
 262
 263             shape = shape_;
 264             shared_block = base->shared_block;
 265         }
 266
 267         static Ptr<BackendWrapper> create(Mat& m) {
 268             return Ptr<BackendWrapper>(new GenericCUDABackendWrapper(m));
 269         }
 270
 271         static Ptr<BackendWrapper> create(const Ptr<BackendWrapper>& base, const MatShape& shape) {
 272             return Ptr<BackendWrapper>(new GenericCUDABackendWrapper(base, shape));
 273         }
 274
 275         void copyToHost() override {
 276             if (shared_block->device_dirty) {
 277                 shared_block->host_dirty = false;
 278                 shared_block->device_dirty = false;
 279
 280                 /* If the wrapper is being reused, the device tensor might be larger in size than the wrapper.
 281                  * Using the device tensor does not give incorrect code but leads to unused region of memory being copied.
 282                  *
 283                  * We use a view to ensure that only the required region of memory is copied.
 284                  */
 285                 auto view = tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
 286                 cuda4dnn::csl::copyTensorToMat<T>(view, shared_block->host, shared_block->stream);
 287
 288                 shared_block->stream.synchronize();
 289             }
 290         }
 291
 292         void setHostDirty() override {
 293             shared_block->device_dirty = false;
 294             shared_block->host_dirty = true;
 295         }
 296
 297         void copyToDevice() override {
 298             if (shared_block->host_dirty) {
 299                 shared_block->host_dirty = false;
 300                 shared_block->device_dirty = false;
 301
 302                 auto span = tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape));
 303                 cuda4dnn::csl::copyMatToTensor<T>(shared_block->host, span, shared_block->stream);
 304             }
 305         }
 306
 307         void setDeviceDirty() override {
 308             shared_block->device_dirty = true;
 309             shared_block->host_dirty = false;
 310         }
 311
 312         MatShape getShape() const noexcept override { return shape; }
 313
 314         std::size_t getRank() const noexcept override { return shape.size(); }
 315
 316         void setStream(cuda4dnn::csl::Stream stream) noexcept override {
 317             shared_block->stream = std::move(stream);
 318         }
 319
 320         cv::Mat getMutableHostMat() noexcept {
 321             copyToHost();
 322             setHostDirty();
 323             return shared_block->host;
 324         }
 325
 326         const cv::Mat getImmutableHostMat() const noexcept {
 327             copyToHost();
 328             return shared_block->host;
 329         }
 330
 331         /* Optimization Note: use getSpan() and getView() judiciously
 332          *
 333          * getSpan() is meant to be used when the memory is going to be modified
 334          * getView() is meant to be used when the memory is only going to be read
 335          *
 336          * getSpan() marks the device memory as dirty but getView() does not
 337          *
 338          * getView() implicitly performs host to device memory transfer if required
 339          * getSpan() does not perform any synchronization (use copyToDevice if sync. is required)
 340          */
 341         tensor_span_type getSpan() noexcept {
 342             setDeviceDirty();
 343             return tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape));
 344         }
 345
 346         tensor_view_type getView() noexcept {
 347             copyToDevice();
 348             return tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
 349         }
 350
 351     private:
 352         /* The same tensor memory can be reused by different layers whenever possible.
 353          * Hence, it is possible for different backend warppers to point to the same memory.
 354          * However, it may use only a part of that memory and have a different shape.
 355          *
 356          * We store the common information such as device tensor and its corresponding host memory in
 357          * a shared block. The shared block is shared by all backend wrappers which use the same memory.
 358          * The shape, which can be different for different wrappers, is stored as a member object.
 359          */
 360
 361         MatShape shape;
 362
 363         struct shared_block_type {
 364             bool host_dirty;
 365             bool device_dirty;
 366
 367             cv::Mat host;
 368             cuda4dnn::csl::MemoryLockGuard memGuard; /* keeps host memory page-locked if possible */
 369
 370             cuda4dnn::csl::ManagedPtr<T> device;
 371             cuda4dnn::csl::Stream stream;
 372         };
 373
 374         std::shared_ptr<shared_block_type> shared_block;
 375     };
 376
 377     using CUDABackendWrapperFP16 = GenericCUDABackendWrapper<half, DNN_TARGET_CUDA_FP16>;
 378     using CUDABackendWrapperFP32 = GenericCUDABackendWrapper<float, DNN_TARGET_CUDA>;
 379
 380     template <class T> struct GetCUDABackendWrapperType_ { };
 381     template <> struct GetCUDABackendWrapperType_<half> { typedef CUDABackendWrapperFP16 type; };
 382     template <> struct GetCUDABackendWrapperType_<float> { typedef CUDABackendWrapperFP32 type; };
 383
 384     template <class T>
 385     using GetCUDABackendWrapperType = typename GetCUDABackendWrapperType_<T>::type;
 386
 387 #endif
 388 }} /* namespace cv::dnn */
 389
 390 #endif /* OPENCV_DNN_SRC_OP_CUDA_HPP */