1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
5 #ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
6 #define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
10 #include "../pointer.hpp"
11 #include "../workspace.hpp"
19 #include <type_traits>
22 namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
24 /** describe convolution filters
26 * @tparam T type of elements in the kernels
29 class FilterDescriptor {
/* RAII owner of a cudnnFilterDescriptor_t.
 * NOTE(review): this is a partial listing — the `template <class T>` header,
 * access specifiers and several closing braces are elided from this view;
 * code below is kept byte-identical, only comments were added.
 */
31 FilterDescriptor() noexcept : descriptor{ nullptr } { } /* empty descriptor; get() returns nullptr */
32 FilterDescriptor(const FilterDescriptor&) = delete; /* non-copyable: unique ownership of the cuDNN handle */
33 FilterDescriptor(FilterDescriptor&& other) noexcept /* move: steal the handle, leave the source empty */
34 : descriptor{ other.descriptor } {
35 other.descriptor = nullptr;
38 /** constructs a filter descriptor from the filter dimensions provided in \p shape
41 * 0: number of filters
42 * 1: number of input feature maps
43 * 2..n: kernel dimensions
45 * Exception Guarantee: Strong
47 template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
48 FilterDescriptor(const SequenceContainer& shape) {
49 constructor(shape.begin(), shape.end());
52 /** constructs a filter descriptor from the filter dimensions provided in [begin, end)
55 * 0: number of filters
56 * 1: number of input feature maps
57 * 2..n: kernel dimensions
59 * Exception Guarantee: Strong
61 template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
62 FilterDescriptor(ForwardItr begin, ForwardItr end) {
63 constructor(begin, end);
66 /** constructs a filter descriptor from the filter dimensions provided as arguments
69 * 0: number of filters
70 * 1: number of input feature maps
71 * 2..n: kernel dimensions
73 * Exception Guarantee: Strong
75 template <class ...Sizes>
76 FilterDescriptor(Sizes ...sizes) {
/* rank bounds are enforced at compile time here; the iterator-based
 * constructors enforce the same bounds at runtime via CV_Assert */
77 static_assert(sizeof...(Sizes) >= 3, "filter descriptors must have at least three dimensions");
78 static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank");
79 std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... };
80 constructor(std::begin(dims), std::end(dims));
83 ~FilterDescriptor() noexcept {
84 if (descriptor != nullptr) {
85 /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
86 CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
90 FilterDescriptor& operator=(const FilterDescriptor&) = delete;
91 FilterDescriptor& operator=(FilterDescriptor&& other) noexcept {
/* NOTE(review): assigns over `descriptor` without destroying the currently
 * owned handle — leaks if the target already owns a descriptor. A
 * std::swap-based implementation would let `other`'s destructor release it;
 * confirm whether callers only ever assign into empty descriptors. */
92 descriptor = other.descriptor;
93 other.descriptor = nullptr;
97 cudnnFilterDescriptor_t get() const noexcept { return descriptor; }
/* shared implementation for all constructors: creates the cuDNN handle and
 * configures it from the dimension range [start, end) */
100 template <class ForwardItr>
101 void constructor(ForwardItr start, ForwardItr end) {
102 CV_Assert(start != end);
103 CV_Assert(std::distance(start, end) >= 3);
104 CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX);
106 CUDA4DNN_CHECK_CUDNN(cudnnCreateFilterDescriptor(&descriptor));
108 const auto rank = std::distance(start, end);
/* presumably a `rank == 4` fast path follows (branch condition elided from
 * this view): the 4d setter below, with the Nd setter as the general case */
110 std::array<int, 4> dims;
111 std::copy(start, end, std::begin(dims));
112 CUDA4DNN_CHECK_CUDNN(
113 cudnnSetFilter4dDescriptor(
115 detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
116 dims[0], dims[1], dims[2], dims[3]
/* general case: arbitrary-rank filter configured via the Nd API */
120 std::vector<int> dims(start, end);
121 CUDA4DNN_CHECK_CUDNN(
122 cudnnSetFilterNdDescriptor(
124 detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
125 dims.size(), dims.data()
/* cleanup path (error handling context elided from this view): release the
 * handle so a failed setter does not leak it — strong exception guarantee */
130 /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
131 CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
/* owned cuDNN handle; nullptr means "no descriptor" */
136 cudnnFilterDescriptor_t descriptor;
139 /** describes a convolution operation
141 * @tparam T type of element participating in convolution
144 class ConvolutionDescriptor {
/* RAII owner of a cudnnConvolutionDescriptor_t.
 * NOTE(review): partial listing — the `template <class T>` header, access
 * specifiers and several closing braces are elided from this view; code is
 * kept byte-identical, only comments were added.
 */
146 ConvolutionDescriptor() noexcept : descriptor{ nullptr } { } /* empty; get() returns nullptr */
147 ConvolutionDescriptor(const ConvolutionDescriptor&) = delete; /* non-copyable: unique ownership */
148 ConvolutionDescriptor(ConvolutionDescriptor&& other) noexcept /* move: steal handle, empty the source */
149 : descriptor{ other.descriptor } {
150 other.descriptor = nullptr;
153 /** constructs a convolution descriptor
156 * - \p zero_padding, \p stride and \p dilation must have the same size
158 * The length of the containers is interpreted as the order of the convolution.
160 * Exception Guarantee: Strong
162 template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
163 ConvolutionDescriptor(
164 const SequenceContainer& zero_padding,
165 const SequenceContainer& stride,
166 const SequenceContainer& dilation,
167 std::size_t group_count)
169 constructor(zero_padding, stride, dilation, group_count);
172 ~ConvolutionDescriptor() noexcept {
173 if (descriptor != nullptr) {
174 /* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
175 CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
179 ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete;
180 ConvolutionDescriptor& operator=(ConvolutionDescriptor&& other) noexcept {
/* NOTE(review): same pattern as FilterDescriptor — overwrites the currently
 * owned handle without destroying it; leaks when assigning onto a non-empty
 * descriptor. Verify callers only assign into default-constructed objects. */
181 descriptor = other.descriptor;
182 other.descriptor = nullptr;
186 cudnnConvolutionDescriptor_t get() const noexcept { return descriptor; }
/* shared constructor implementation: creates the handle, then configures
 * padding/stride/dilation and the group count */
189 template <class SequenceContainer>
191 const SequenceContainer& zero_padding,
192 const SequenceContainer& stride,
193 const SequenceContainer& dilation,
194 std::size_t group_count)
/* all three parameter containers must describe the same convolution order */
196 CV_Assert(zero_padding.size() == stride.size());
197 CV_Assert(zero_padding.size() == dilation.size());
199 CUDA4DNN_CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&descriptor));
201 const auto rank = zero_padding.size();
/* presumably a `rank == 2` fast path follows (branch condition elided from
 * this view): the 2d setter below, with the Nd setter as the general case */
203 CUDA4DNN_CHECK_CUDNN(
204 cudnnSetConvolution2dDescriptor(
206 zero_padding[0], zero_padding[1],
207 stride[0], stride[1],
208 dilation[0], dilation[1],
209 CUDNN_CROSS_CORRELATION,
210 detail::get_data_type<T>()
/* general case: copy parameters into int vectors for the Nd API */
214 std::vector<int> ipadding(std::begin(zero_padding), std::end(zero_padding));
215 std::vector<int> istride(std::begin(stride), std::end(stride));
216 std::vector<int> idilation(std::begin(dilation), std::end(dilation));
217 CUDA4DNN_CHECK_CUDNN(
218 cudnnSetConvolutionNdDescriptor(
220 rank, ipadding.data(), istride.data(), idilation.data(),
221 CUDNN_CROSS_CORRELATION,
222 detail::get_data_type<T>()
/* grouped convolution: group_count == 1 means an ordinary convolution */
226 CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count));
/* cleanup path (error handling context elided from this view): release the
 * handle so a failed setter does not leak it — strong exception guarantee */
228 /* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
229 CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
/* owned cuDNN handle; nullptr means "no descriptor" */
234 cudnnConvolutionDescriptor_t descriptor;
237 /** wrapper around a convolution algorithm
239 * @tparam T type of elements being convolved
242 class ConvolutionAlgorithm {
/* Value type pairing a cuDNN forward-convolution algorithm choice with the
 * workspace size (in bytes) that algorithm requires.
 * NOTE(review): partial listing — template header, access specifiers and
 * closing braces are elided from this view; only comments were added. */
244 ConvolutionAlgorithm() noexcept : workspace_size{ 0 } { } /* `algo` left uninitialized until selection */
/* NOTE(review): copy constructor takes a non-const reference — a const
 * ConvolutionAlgorithm cannot be copied from; should likely be
 * `const ConvolutionAlgorithm&` (the copy-assignment below already is). */
245 ConvolutionAlgorithm(ConvolutionAlgorithm&) = default;
246 ConvolutionAlgorithm(ConvolutionAlgorithm&&) = default;
248 /** selects a good algorithm for convolution for given configuration
250 * Exception Guarantee: Strong
252 ConvolutionAlgorithm(
253 const Handle& handle,
254 const ConvolutionDescriptor<T>& conv,
255 const FilterDescriptor<T>& filter,
256 const TensorDescriptor<T>& input,
257 const TensorDescriptor<T>& output)
/* ask cuDNN for the fastest algorithm with no workspace-memory cap */
259 CUDA4DNN_CHECK_CUDNN(
260 cudnnGetConvolutionForwardAlgorithm(
262 input.get(), filter.get(), conv.get(), output.get(),
263 CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
264 0, /* no memory limit */
/* query how much scratch memory the chosen algorithm needs */
269 CUDA4DNN_CHECK_CUDNN(
270 cudnnGetConvolutionForwardWorkspaceSize(
272 input.get(), filter.get(), conv.get(), output.get(),
273 algo, &workspace_size
278 ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;
279 ConvolutionAlgorithm& operator=(ConvolutionAlgorithm&& other) = default;
281 cudnnConvolutionFwdAlgo_t get() const noexcept { return algo; }
283 /** number of bytes of workspace memory required by the algorithm */
284 std::size_t get_workspace_size() const noexcept { return workspace_size; }
287 cudnnConvolutionFwdAlgo_t algo;
288 std::size_t workspace_size;
291 /** gives the shape of the output tensor of convolution
293 * Exception Guarantee: Basic
296 void getConvolutionForwardOutputDim(
297 const ConvolutionDescriptor<T>& convDesc,
298 const FilterDescriptor<T>& filterDesc,
299 const TensorDescriptor<T>& inputDesc,
300 std::vector<int>& output) /* out: output tensor dimensions, resized to the rank */
/* NOTE(review): partial listing — some argument lines of the two cuDNN calls
 * below (and a resize of `output` to `rank`, presumably) are elided here. */
303 output.resize(CUDNN_DIM_MAX); /* we use `output` to hold temporaries */
305 std::vector<int> temp(CUDNN_DIM_MAX);
306 cudnnDataType_t tempDataType;
/* query the input descriptor only to learn its rank; dims/strides/dtype
 * returned into the scratch vectors are discarded */
307 CUDA4DNN_CHECK_CUDNN(
308 cudnnGetTensorNdDescriptor(
310 CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */
/* presumably the queried rank was written into output[0] by the call above —
 * TODO confirm against the elided argument list */
318 const auto rank = output[0];
/* compute the convolution output shape for that rank */
320 CUDA4DNN_CHECK_CUDNN(
321 cudnnGetConvolutionNdForwardOutputDim(
322 convDesc.get(), inputDesc.get(), filterDesc.get(), rank, output.data()
327 /** @brief performs convolution
329 * dstValue = alpha * result + beta * priorDstValue
331 * @tparam T convolution element type (must be `half` or `float`)
333 * @param handle valid cuDNN Handle
334 * @param convDesc convolution description
335 * @param convAlgo algorithm to use for convolution
336 * @param workspace workspace memory which meets the requirements of \p convAlgo
337 * @param filterDesc filter descriptor
338 * @param[in] filterPtr pointer to device memory containing the filters
339 * @param inputDesc tensor descriptor describing the input
340 * @param[in] inputPtr pointer to input tensor in device memory
341 * @param alpha result scale factor
342 * @param beta previous value scale factor
343 * @param outputDesc tensor descriptor describing the output
344 * @param[out] outputPtr pointer to output tensor in device memory
346 * Exception Guarantee: Basic
/* NOTE(review): partial listing — the `template <class T> void convolve(`
 * header and the `alpha`/`beta` parameter line are elided from this view;
 * the documentation block above describes the full parameter list. */
350 const Handle& handle,
351 const ConvolutionDescriptor<T>& convDesc,
352 const ConvolutionAlgorithm<T>& convAlgo,
353 WorkspaceInstance workspace,
354 const FilterDescriptor<T>& filterDesc,
355 DevicePtr<const T> filterPtr,
356 const TensorDescriptor<T>& inputDesc,
357 DevicePtr<const T> inputPtr,
359 const TensorDescriptor<T>& outputDesc,
360 DevicePtr<T> outputPtr)
/* forwards directly to cuDNN; dstValue = alpha * conv(input, filter) + beta * dstValue */
364 CUDA4DNN_CHECK_CUDNN(
365 cudnnConvolutionForward(
367 &alpha, inputDesc.get(), inputPtr.get(),
368 filterDesc.get(), filterPtr.get(),
369 convDesc.get(), convAlgo.get(),
370 static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
371 &beta, outputDesc.get(), outputPtr.get()
/* NOTE(review): partial listing — the `template <> inline void convolve(`
 * specialization header is elided from this view. */
378 const Handle& handle,
379 const ConvolutionDescriptor<half>& convDesc,
380 const ConvolutionAlgorithm<half>& convAlgo,
381 WorkspaceInstance workspace,
382 const FilterDescriptor<half>& filterDesc,
383 DevicePtr<const half> filterPtr,
384 const TensorDescriptor<half>& inputDesc,
385 DevicePtr<const half> inputPtr,
386 half alpha, half beta,
387 const TensorDescriptor<half>& outputDesc,
388 DevicePtr<half> outputPtr)
392 /* we specialize for fp16 as the scaling factors must be provided as `float` */
393 float alpha_ = alpha, beta_ = beta;
394 CUDA4DNN_CHECK_CUDNN(
395 cudnnConvolutionForward(
397 &alpha_, inputDesc.get(), inputPtr.get(),
398 filterDesc.get(), filterPtr.get(),
399 convDesc.get(), convAlgo.get(),
400 static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
401 &beta_, outputDesc.get(), outputPtr.get()
406 }}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
408 #endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */