// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_OP_CUDA_HPP
#define OPENCV_DNN_SRC_OP_CUDA_HPP

#ifdef HAVE_CUDA
#include "cuda4dnn/csl/stream.hpp"
#include "cuda4dnn/csl/cublas.hpp"
#include "cuda4dnn/csl/cudnn.hpp"
#include "cuda4dnn/csl/tensor.hpp"
#include "cuda4dnn/csl/memory.hpp"
#include "cuda4dnn/csl/fp16.hpp"
#include "cuda4dnn/csl/workspace.hpp"
#endif

#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/core.hpp>

#include <cstddef>
#include <memory>
#include <vector>
#include <utility>
#include <iterator>

namespace cv { namespace dnn {

    constexpr bool IS_DNN_CUDA_TARGET(int id) {
        return id == DNN_TARGET_CUDA_FP16 || id == DNN_TARGET_CUDA;
    }

    constexpr bool haveCUDA() {
#ifdef HAVE_CUDA
        return true;
#else
        return false;
#endif
    }

#ifdef HAVE_CUDA
    namespace cuda4dnn { namespace csl {
        struct CSLContext {
            Stream stream;
            cublas::Handle cublas_handle;
            cudnn::Handle cudnn_handle;
        };

        /** @brief creates Tensor object from cv::Mat (only the header is created, i.e. no data is copied)
         *
         * \tparam      T   element type for the tensor
         * \param[in]   mat cv::Mat from which the shape must be inferred
         *
         * \return a Tensor object with the shape of \p mat
         */
        template <class T>
        Tensor<T> makeTensorHeader(const Mat& mat) {
            auto sizes = shape(mat);
            return Tensor<T>(std::begin(sizes), std::end(sizes));
        }
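
        /* Example (illustrative sketch; `mat` is assumed to be a cv::Mat owned by the caller):
         *
         *     auto tensor = makeTensorHeader<float>(mat); // tensor with the shape of `mat`; no data is copied
         *
         * The tensor contents still have to be filled, e.g. with copyMatToTensor() below.
         */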

        /** @brief copies data from a cv::Mat to TensorType
         *
         * \tparam  T   the type of the elements contained in TensorType object
         *
         * \param[in]   srcMat      source matrix
         * \param[out]  destTensor  destination tensor
         * \param       stream      CUDA stream to use for the memory transfer
         *
         * The memory copy starts from the beginning of \p srcMat. The number of elements copied is
         * equal to the number of elements in \p destTensor.
         *
         * Pre-conditions:
         * - \p srcMat must contain elements of type CV_32F
         * - the size of \p srcMat must be larger than or equal to the size of \p destTensor
         *
         * @note best performance when \p srcMat is continuous and page-locked
         * @note blocks calling thread if \p srcMat is not page-locked
         */
        template <class T>
        void copyMatToTensor(const Mat& srcMat, const TensorSpan<T> destTensor, const Stream& stream);

        template <> inline
        void copyMatToTensor(const Mat& srcMat, const TensorSpan<half> destTensor, const Stream& stream) {
            /* should perhaps convert cv::Mat of different type to the required type and copy */
            CV_Assert(srcMat.type() == CV_32F);
            CV_Assert(srcMat.total() >= destTensor.size());

            Mat temp;
            srcMat.convertTo(temp, CV_16F);
            CV_Assert(temp.isContinuous());

            memcpy<half>(destTensor.get(), reinterpret_cast<half*>(temp.data), destTensor.size(), stream);
        }

        template <> inline
        void copyMatToTensor(const Mat& srcMat, const TensorSpan<float> destTensor, const Stream& stream) {
            /* should perhaps convert cv::Mat of different type to the required type and copy */
            CV_Assert(srcMat.type() == CV_32F);
            CV_Assert(srcMat.total() >= destTensor.size());

            Mat temp = srcMat.isContinuous() ? srcMat : srcMat.clone();
            CV_Assert(temp.isContinuous());

            memcpy<float>(destTensor.get(), reinterpret_cast<float*>(temp.data), destTensor.size(), stream);
        }
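
        /* Example: uploading a CV_32F cv::Mat to a device tensor (illustrative sketch;
         * `mat` and `stream` are assumed to be provided by the caller):
         *
         *     auto tensor = makeTensorHeader<float>(mat);
         *     copyMatToTensor<float>(mat, tensor, stream);
         *
         * The copy is enqueued on `stream`; it blocks the calling thread only if `mat`
         * is not page-locked.
         */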

        /** @brief copies data from a TensorType to a cv::Mat
         *
         * \tparam  T   the type of the elements contained in TensorType object
         *
         * \param[in]   srcTensor   source tensor
         * \param[out]  destMat     destination matrix
         * \param       stream      CUDA stream to use for the memory transfer
         *
         * The entire memory block held by the \p srcTensor is copied to \p destMat.
         *
         * Pre-conditions:
         * - \p destMat must contain elements of type CV_32F
         * - the size of \p destMat must be larger than or equal to the size of \p srcTensor
         *
         * @note best performance when \p destMat is continuous and page-locked
         * @note blocks calling thread if \p destMat is not page-locked
         */
        template <class T>
        void copyTensorToMat(TensorView<T> srcTensor, Mat& destMat, const Stream& stream);

        template <> inline
        void copyTensorToMat(TensorView<half> srcTensor, Mat& destMat, const Stream& stream) {
            CV_Assert(destMat.type() == CV_32F);
            CV_Assert(destMat.total() >= srcTensor.size());

            Mat temp(shape(destMat), CV_16F);
            CV_Assert(temp.isContinuous());

            memcpy<half>(reinterpret_cast<half*>(temp.data), srcTensor.get(), srcTensor.size(), stream);

            temp.convertTo(destMat, CV_32F);
        }

        template <> inline
        void copyTensorToMat(TensorView<float> srcTensor, Mat& destMat, const Stream& stream) {
            CV_Assert(destMat.type() == CV_32F);
            CV_Assert(destMat.total() >= srcTensor.size());

            Mat temp = destMat.isContinuous() ? destMat : destMat.clone();
            CV_Assert(temp.isContinuous());

            memcpy<float>(reinterpret_cast<float*>(temp.data), srcTensor.get(), srcTensor.size(), stream);

            if (temp.data != destMat.data)
                temp.copyTo(destMat);
        }
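
        /* Example: downloading a device tensor into a CV_32F cv::Mat (illustrative sketch;
         * `tensor`, `mat` and `stream` are assumed to be provided by the caller):
         *
         *     copyTensorToMat<float>(tensor, mat, stream);
         *     stream.synchronize(); // wait for the asynchronous copy to finish before reading `mat`
         */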

    }} /* namespace cuda4dnn::csl */

    /** base class for CUDA operation nodes (for all supported targets) */
    class CUDABackendNode : public BackendNode {
    public:
        CUDABackendNode() : BackendNode(DNN_BACKEND_CUDA) { }
        virtual ~CUDABackendNode() { }

        virtual void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            cuda4dnn::csl::Workspace& workspace) = 0;

        virtual std::size_t get_workspace_memory_in_bytes() const noexcept { return 0; }
    };
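
    /* A minimal sketch of a concrete operation node (illustrative only; `PassthroughOp` is a
     * hypothetical example and not part of this header):
     *
     *     template <class T>
     *     class PassthroughOp final : public CUDABackendNode {
     *     public:
     *         PassthroughOp(cuda4dnn::csl::Stream stream_) : stream(std::move(stream_)) { }
     *
     *         void forward(
     *             const std::vector<cv::Ptr<BackendWrapper>>& inputs,
     *             const std::vector<cv::Ptr<BackendWrapper>>& outputs,
     *             cuda4dnn::csl::Workspace& workspace) override
     *         {
     *             // obtain device tensors from the wrappers and launch kernels on `stream`
     *         }
     *
     *     private:
     *         cuda4dnn::csl::Stream stream;
     *     };
     */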

    /** @brief utility function which creates CUDA node of correct type from `targetId`
     *
     * CUDA operation nodes take the type of data they operate on as a template parameter.
     * For example, ConcatOp<float> is an operation node which concatenates tensors of `float` type
     * into a tensor of `float` type.
     *
     * This utility function aids the creation of nodes of different types and eliminates the
     * need for CUDA target constants (`DNN_TARGET_XXX`) to appear in the operation code which
     * reduces coupling between modules.
     *
     * Example:
     * template <class T>
     * class ConcatOp : public CUDABackendNode;
     *
     * // returns a cv::Ptr to a ConcatOp<half> object
     * auto node = make_cuda_node<ConcatOp>(DNN_TARGET_CUDA_FP16, axis);
     *
     * // returns a cv::Ptr to a ConcatOp<float> object
     * auto node = make_cuda_node<ConcatOp>(DNN_TARGET_CUDA, axis);
     */
    template <template <class> class NodeType, class ...Args>
    cv::Ptr<BackendNode> make_cuda_node(int targetId, Args&& ...args) {
        switch (targetId)
        {
        case DNN_TARGET_CUDA_FP16:
            return Ptr<BackendNode>(new NodeType<half>(std::forward<Args>(args)...));
        case DNN_TARGET_CUDA:
            return Ptr<BackendNode>(new NodeType<float>(std::forward<Args>(args)...));
        default:
            CV_Assert(IS_DNN_CUDA_TARGET(targetId));
        }
        return Ptr<BackendNode>();
    }

    /* base class for all CUDA backend/target wrappers */
    class CUDABackendWrapper : public BackendWrapper {
    public:
        CUDABackendWrapper(int targetId) : BackendWrapper(DNN_BACKEND_CUDA, targetId) { }
        virtual ~CUDABackendWrapper() { }

        void copyToHost() override = 0;
        void setHostDirty() override = 0;

        virtual void copyToDevice() = 0;
        virtual void setDeviceDirty() = 0;

        virtual MatShape getShape() const noexcept = 0;
        virtual std::size_t getRank() const noexcept = 0;

        /** @note setting the stream updates the stream for all wrappers which use the same tensor */
        virtual void setStream(cuda4dnn::csl::Stream stream) noexcept = 0;
    };

    template <class T, int TargetID>
    class GenericCUDABackendWrapper final : public CUDABackendWrapper {
    public:
        using value_type = T;
        using tensor_span_type = cuda4dnn::csl::TensorSpan<value_type>;
        using tensor_view_type = cuda4dnn::csl::TensorView<value_type>;

        /* Pre-conditions:
         * - there must be no other instance of `GenericCUDABackendWrapper` which wraps the host memory used by `m`
         * - the host memory must remain allocated throughout the lifetime of this object
         *
         * Post-conditions:
         * - the host memory used by \p m "may" be page-locked
         */
        GenericCUDABackendWrapper(Mat& m)
            : CUDABackendWrapper(TargetID)
        {
            shape = cv::dnn::shape(m);

            shared_block = std::make_shared<shared_block_type>();
            shared_block->host_dirty = true;
            shared_block->device_dirty = false;

            shared_block->host = m;

            try {
                shared_block->memGuard = cuda4dnn::csl::MemoryLockGuard(m.data, m.total() * m.elemSize());
            } catch (...) {
                /* a common reason for failure is that the host system (for example, a Jetson device) does not support it */
                /* we ignore the failure as this is just an optimization and not a requirement */
            }

            shared_block->device = cuda4dnn::csl::ManagedPtr<T>(m.total());
        }

        GenericCUDABackendWrapper(const Ptr<BackendWrapper>& base_, const MatShape& shape_)
            : CUDABackendWrapper(TargetID)
        {
            const Ptr<GenericCUDABackendWrapper> base = base_.dynamicCast<GenericCUDABackendWrapper>();
            CV_Assert(base);

            shape = shape_;
            shared_block = base->shared_block;
        }

        static Ptr<BackendWrapper> create(Mat& m) {
            return Ptr<BackendWrapper>(new GenericCUDABackendWrapper(m));
        }

        static Ptr<BackendWrapper> create(const Ptr<BackendWrapper>& base, const MatShape& shape) {
            return Ptr<BackendWrapper>(new GenericCUDABackendWrapper(base, shape));
        }

        void copyToHost() override {
            if (shared_block->device_dirty) {
                shared_block->host_dirty = false;
                shared_block->device_dirty = false;

                /* If the wrapper is being reused, the device tensor might be larger in size than the wrapper.
                 * Using the device tensor does not give incorrect results but leads to an unused region of memory being copied.
                 *
                 * We use a view to ensure that only the required region of memory is copied.
                 */
                auto view = tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
                cuda4dnn::csl::copyTensorToMat<T>(view, shared_block->host, shared_block->stream);

                shared_block->stream.synchronize();
            }
        }

        void setHostDirty() override {
            shared_block->device_dirty = false;
            shared_block->host_dirty = true;
        }

        void copyToDevice() override {
            if (shared_block->host_dirty) {
                shared_block->host_dirty = false;
                shared_block->device_dirty = false;

                auto span = tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape));
                cuda4dnn::csl::copyMatToTensor<T>(shared_block->host, span, shared_block->stream);
            }
        }

        void setDeviceDirty() override {
            shared_block->device_dirty = true;
            shared_block->host_dirty = false;
        }

        MatShape getShape() const noexcept override { return shape; }

        std::size_t getRank() const noexcept override { return shape.size(); }

        void setStream(cuda4dnn::csl::Stream stream) noexcept override {
            shared_block->stream = std::move(stream);
        }

        cv::Mat getMutableHostMat() noexcept {
            copyToHost();
            setHostDirty();
            return shared_block->host;
        }

        const cv::Mat getImmutableHostMat() const noexcept {
            return shared_block->host;
        }

        /* Optimization Note: use getSpan() and getView() judiciously
         *
         * getSpan() is meant to be used when the memory is going to be modified
         * getView() is meant to be used when the memory is only going to be read
         *
         * getSpan() marks the device memory as dirty but getView() does not
         *
         * getView() implicitly performs host to device memory transfer if required
         * getSpan() does not perform any synchronization (use copyToDevice if sync. is required)
         */
        tensor_span_type getSpan() noexcept {
            setDeviceDirty();
            return tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape));
        }

        tensor_view_type getView() noexcept {
            copyToDevice();
            return tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
        }
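
        /* Example: a typical forward() implementation reads its inputs through views and writes
         * its outputs through spans (illustrative sketch; `input_wrapper` and `output_wrapper`
         * are hypothetical cv::Ptr instances of this wrapper type):
         *
         *     auto input = input_wrapper->getView();   // brings the device copy up to date if needed
         *     auto output = output_wrapper->getSpan(); // marks the device memory as modified
         */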

    private:
        /* The same tensor memory can be reused by different layers whenever possible.
         * Hence, it is possible for different backend wrappers to point to the same memory.
         * However, a wrapper may use only a part of that memory and have a different shape.
         *
         * We store the common information such as the device tensor and its corresponding host memory in
         * a shared block. The shared block is shared by all backend wrappers which use the same memory.
         * The shape, which can be different for different wrappers, is stored as a member object.
         */

        MatShape shape;

        struct shared_block_type {
            bool host_dirty;
            bool device_dirty;

            cv::Mat host;
            cuda4dnn::csl::MemoryLockGuard memGuard; /* keeps host memory page-locked if possible */

            cuda4dnn::csl::ManagedPtr<T> device;
            cuda4dnn::csl::Stream stream;
        };

        std::shared_ptr<shared_block_type> shared_block;
    };

    using CUDABackendWrapperFP16 = GenericCUDABackendWrapper<half, DNN_TARGET_CUDA_FP16>;
    using CUDABackendWrapperFP32 = GenericCUDABackendWrapper<float, DNN_TARGET_CUDA>;

    template <class T> struct GetCUDABackendWrapperType_ { };
    template <> struct GetCUDABackendWrapperType_<half> { typedef CUDABackendWrapperFP16 type; };
    template <> struct GetCUDABackendWrapperType_<float> { typedef CUDABackendWrapperFP32 type; };

    template <class T>
    using GetCUDABackendWrapperType = typename GetCUDABackendWrapperType_<T>::type;
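
    /* Example: code templated on the element type T can recover the concrete wrapper type without
     * mentioning DNN_TARGET_XXX constants (illustrative sketch; `wrapper` is a hypothetical
     * cv::Ptr<BackendWrapper>):
     *
     *     using wrapper_type = GetCUDABackendWrapperType<T>;
     *     auto input = wrapper.dynamicCast<wrapper_type>()->getView();
     */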

#endif /* HAVE_CUDA */

}} /* namespace cv::dnn */

#endif /* OPENCV_DNN_SRC_OP_CUDA_HPP */