1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
5 #ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP
6 #define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP
8 #include "../../op_cuda.hpp"
10 #include "../csl/cudnn.hpp"
11 #include "../csl/stream.hpp"
12 #include "../csl/tensor.hpp"
13 #include "../csl/tensor_ops.hpp"
14 #include "../kernels/scale_shift.hpp"
16 #include <opencv2/core.hpp>
24 namespace cv { namespace dnn { namespace cuda4dnn {
/* Parameter bundle describing the convolution that ConvolutionOp has to perform.
 *
 * None of the members has a default initializer; the caller is expected to fill in
 * every field that is relevant for the chosen padding mode before constructing the op.
 */
struct ConvolutionConfiguration {
    /* per-axis filter parameters; `dilations` and `strides` must have exactly
     * as many entries as `kernel_size` (one entry per convolved axis)
     */
    std::vector<std::size_t> kernel_size;
    std::vector<std::size_t> dilations, strides;

    enum class PaddingMode {
        MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */
        VALID, /* no padding is added */
        SAME /* TensorFlow logic is used for same padding */
    };

    /* explicit paddings are used if and only if `padMode` is set to MANUAL;
     * in that case both vectors must have one entry per convolved axis
     */
    PaddingMode padMode;
    std::vector<std::size_t> pads_begin, pads_end;

    /* full shape inclusive of channel and batch axis */
    std::vector<std::size_t> input_shape;
    std::vector<std::size_t> output_shape;

    /* group count for grouped convolution; must divide the input channel count */
    std::size_t groups;
};
50 class ConvolutionOp final : public CUDABackendNode {
52 using wrapper_type = GetCUDABackendWrapperType<T>;
54 ConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle, const ConvolutionConfiguration& config, const Mat& filters, const Mat& bias)
55 : stream(std::move(stream_)), cudnnHandle(std::move(handle))
57 const auto& kernel_size = config.kernel_size;
58 const auto& dilations = config.dilations;
59 const auto& strides = config.strides;
61 const auto convolution_order = kernel_size.size();
62 CV_Assert(convolution_order >= 1);
64 CV_Assert(convolution_order == dilations.size());
65 CV_Assert(convolution_order == strides.size());
67 const auto& input_shape = config.input_shape;
68 const auto& output_shape = config.output_shape;
69 CV_Assert(input_shape.size() == output_shape.size());
70 CV_Assert(input_shape.size() == convolution_order + 2);
72 const auto groups = config.groups;
74 if (convolution_order > 3)
75 CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D convolution is supported.");
77 const auto rank = input_shape.size();
78 const auto output_feature_maps = output_shape[1];
79 const auto input_feature_maps = input_shape[1];
80 const auto input_feature_maps_per_group = input_feature_maps / groups;
81 CV_Assert(input_feature_maps % groups == 0);
83 filtersTensor = csl::makeTensorHeader<T>(filters);
84 csl::copyMatToTensor<T>(filters, filtersTensor, stream);
88 biasTensor = csl::makeTensorHeader<T>(bias);
89 csl::copyMatToTensor<T>(bias, biasTensor, stream);
92 /* left and right are misleading as the padding is applicable for any number of dimensions
93 * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
95 * `common_padding` contains the amount of padding that has to be added to both sides
96 * `padding_left` and `padding_right` contains the amount of padding that needs to be added
97 * to a particular side in addition to the common padding
99 std::vector<std::size_t> common_padding(rank, 0);
100 std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0);
101 if (config.padMode == ConvolutionConfiguration::PaddingMode::MANUAL)
103 const auto& pads_begin = config.pads_begin;
104 const auto& pads_end = config.pads_end;
106 CV_Assert(convolution_order == pads_begin.size());
107 CV_Assert(convolution_order == pads_end.size());
109 for (int i = 2; i < common_padding.size(); i++)
111 common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]);
112 padding_left[i] = pads_begin[i - 2] - common_padding[i];
113 padding_right[i] = pads_end[i - 2] - common_padding[i];
116 else if (config.padMode == ConvolutionConfiguration::PaddingMode::VALID)
118 /* nothing to do as the paddings are already preset to zero */
120 else if (config.padMode == ConvolutionConfiguration::PaddingMode::SAME)
123 * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
125 * if total padding is odd, the extra is added towards the end
127 for (int i = 2; i < rank; i++)
129 const auto j = i - 2; /* filter index */
130 const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
131 const auto required_total_padding =
132 std::max<std::int64_t>(0, (output_shape[i] - 1) * strides[j] + effective_kernel_size - input_shape[i]);
134 common_padding[i] = required_total_padding / 2;
136 padding_right[i] = required_total_padding % 2;
140 /* in some scenarios, the extra padding at the end may not change the output at all */
141 for (int i = 2; i < rank; i++) {
142 const auto j = i - 2; /* filter idx */
143 const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
144 const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
145 std::int64_t rem = (input_shape[i] + total_padding - effective_kernel_size) % strides[j];
147 /* the output shape doesn't change if we decrease the total padding by at most `rem`
148 * provided that we decrease from the right
150 if (rem && padding_right[i] > 0)
151 padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem);
154 auto is_not_zero = [](std::size_t i) { return i != 0; };
155 if(std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) ||
156 std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero))
158 /* csl::Convolution supports symmetric padding only; hence, we deal with asymmetric padding by
159 * copying the input to a bigger tensor and padding the ends manually
161 transformed_shape = input_shape;
162 for (int i = 0; i < rank; i++)
163 transformed_shape[i] += padding_left[i] + padding_right[i];
165 inputTransformer = csl::TensorTransform<T>(cudnnHandle, padding_left, padding_right);
168 typename csl::Convolution<T>::params_type params;
169 if (transformed_shape.empty())
171 params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
175 /* the convolution operation will be seeing the transformed input */
176 params.input_shape.assign(std::begin(transformed_shape), std::end(transformed_shape));
179 auto& fshape = params.filter_shape;
181 fshape[0] = output_feature_maps;
182 fshape[1] = input_feature_maps_per_group;
183 std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2);
184 CV_Assert(fshape.size() == kernel_size.size() + 2);
186 params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
187 params.stride = strides;
188 params.dilation = dilations;
189 params.groups = config.groups;
191 convoluter = csl::Convolution<T>(cudnnHandle, params);
193 csl::WorkspaceBuilder builder;
194 if (!transformed_shape.empty()) {
195 auto& shape = transformed_shape;
196 auto sz = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<std::size_t>());
197 builder.require<T>(sz);
199 builder.require(convoluter.get_workspace_size());
200 scratch_mem_in_bytes = builder.required_workspace_size();
204 const std::vector<cv::Ptr<BackendWrapper>>& inputs,
205 const std::vector<cv::Ptr<BackendWrapper>>& outputs,
206 csl::Workspace& workspace) override
208 CV_Assert(inputs.size() == 1 && outputs.size() == 1);
210 csl::WorkspaceAllocator allocator(workspace);
212 auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
213 auto input = input_wrapper->getView();
215 if (!transformed_shape.empty())
217 auto& shape = transformed_shape;
218 auto transformed_input = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
219 inputTransformer.transform(input, transformed_input);
220 input = transformed_input;
223 auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
224 auto output = output_wrapper->getSpan();
226 convoluter.convolve(output, input, filtersTensor, allocator.get_instance());
227 if (!biasTensor.empty())
229 std::size_t inner_size = output.size_range(2, output.rank());
230 kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
234 std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
238 csl::cudnn::Handle cudnnHandle;
239 csl::Tensor<T> filtersTensor, biasTensor;
240 csl::Convolution<T> convoluter;
242 std::vector<std::size_t> transformed_shape;
243 csl::TensorTransform<T> inputTransformer;
245 std::size_t scratch_mem_in_bytes;
248 }}} /* namespace cv::dnn::cuda4dnn */
250 #endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP */