modules/dnn/src/cuda4dnn/primitives/convolution.hpp

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 #ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP
   6 #define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP
   7
   8 #include "../../op_cuda.hpp"
   9
  10 #include "../csl/cudnn.hpp"
  11 #include "../csl/stream.hpp"
  12 #include "../csl/tensor.hpp"
  13 #include "../csl/tensor_ops.hpp"
  14 #include "../kernels/scale_shift.hpp"
  15
  16 #include <opencv2/core.hpp>
  17
  18 #include <cstddef>
  19 #include <cstdint>
  20 #include <vector>
  21 #include <utility>
  22 #include <algorithm>
  23
  24 namespace cv { namespace dnn { namespace cuda4dnn {
  25
  26     struct ConvolutionConfiguration {
  27         /* the size of the following vectors must be equal to the kernel size */
  28         std::vector<std::size_t> kernel_size;
  29         std::vector<std::size_t> dilations, strides;
  30
  31         enum class PaddingMode {
  32             MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */
  33             VALID, /* no padding is added */
  34             SAME /* TensorFlow logic is used for same padding */
  35         };
  36
  37         /* explicit paddings are used if and only if padMode is set to manual */
  38         PaddingMode padMode;
  39         std::vector<std::size_t> pads_begin, pads_end;
  40
  41         /* full shape inclusive of channel and batch axis */
  42         std::vector<std::size_t> input_shape;
  43         std::vector<std::size_t> output_shape;
  44
  45         /* group count for grouped convolution */
  46         std::size_t groups;
  47     };
  48
  49     template <class T>
  50     class ConvolutionOp final : public CUDABackendNode {
  51     public:
  52         using wrapper_type = GetCUDABackendWrapperType<T>;
  53
  54         ConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle, const ConvolutionConfiguration& config, const Mat& filters, const Mat& bias)
  55             : stream(std::move(stream_)), cudnnHandle(std::move(handle))
  56         {
  57             const auto& kernel_size = config.kernel_size;
  58             const auto& dilations = config.dilations;
  59             const auto& strides = config.strides;
  60
  61             const auto convolution_order = kernel_size.size();
  62             CV_Assert(convolution_order >= 1);
  63
  64             CV_Assert(convolution_order == dilations.size());
  65             CV_Assert(convolution_order == strides.size());
  66
  67             const auto& input_shape = config.input_shape;
  68             const auto& output_shape = config.output_shape;
  69             CV_Assert(input_shape.size() == output_shape.size());
  70             CV_Assert(input_shape.size() == convolution_order + 2);
  71
  72             const auto groups = config.groups;
  73
  74             if (convolution_order > 3)
  75                 CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D convolution is supported.");
  76
  77             const auto rank = input_shape.size();
  78             const auto output_feature_maps = output_shape[1];
  79             const auto input_feature_maps = input_shape[1];
  80             const auto input_feature_maps_per_group = input_feature_maps / groups;
  81             CV_Assert(input_feature_maps % groups == 0);
  82
  83             filtersTensor = csl::makeTensorHeader<T>(filters);
  84             csl::copyMatToTensor<T>(filters, filtersTensor, stream);
  85
  86             if (!bias.empty())
  87             {
  88                 biasTensor = csl::makeTensorHeader<T>(bias);
  89                 csl::copyMatToTensor<T>(bias, biasTensor, stream);
  90             }
  91
  92             /* left and right are misleading as the padding is applicable for any number of dimensions
  93              * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
  94              *
  95              * `common_padding` contains the amount of padding that has to be added to both sides
  96              * `padding_left` and `padding_right` contains the amount of padding that needs to be added
  97              * to a particular side in addition to the common padding
  98              */
  99             std::vector<std::size_t> common_padding(rank, 0);
 100             std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0);
 101             if (config.padMode == ConvolutionConfiguration::PaddingMode::MANUAL)
 102             {
 103                 const auto& pads_begin = config.pads_begin;
 104                 const auto& pads_end = config.pads_end;
 105
 106                 CV_Assert(convolution_order == pads_begin.size());
 107                 CV_Assert(convolution_order == pads_end.size());
 108
 109                 for (int i = 2; i < common_padding.size(); i++)
 110                 {
 111                     common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]);
 112                     padding_left[i] = pads_begin[i - 2] - common_padding[i];
 113                     padding_right[i] = pads_end[i - 2] - common_padding[i];
 114                 }
 115             }
 116             else if (config.padMode == ConvolutionConfiguration::PaddingMode::VALID)
 117             {
 118                 /* nothing to do as the paddings are already preset to zero */
 119             }
 120             else if (config.padMode == ConvolutionConfiguration::PaddingMode::SAME)
 121             {
 122                 /* TensorFlow Logic:
 123                  * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
 124                  *
 125                  * if total padding is odd, the extra is added towards the end
 126                  */
 127                 for (int i = 2; i < rank; i++)
 128                 {
 129                     const auto j = i - 2; /* filter index */
 130                     const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
 131                     const auto required_total_padding =
 132                         std::max<std::int64_t>(0, (output_shape[i] - 1) * strides[j] + effective_kernel_size - input_shape[i]);
 133
 134                     common_padding[i] = required_total_padding / 2;
 135                     padding_left[i] = 0;
 136                     padding_right[i] = required_total_padding % 2;
 137                 }
 138             }
 139
 140             /* in some scenarios, the extra padding at the end may not change the output at all */
 141             for (int i = 2; i < rank; i++) {
 142                 const auto j = i - 2; /* filter idx */
 143                 const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
 144                 const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
 145                 std::int64_t rem = (input_shape[i] + total_padding - effective_kernel_size) % strides[j];
 146
 147                 /* the output shape doesn't change if we decrease the total padding by at most `rem`
 148                  * provided that we decrease from the right
 149                  */
 150                 if (rem && padding_right[i] > 0)
 151                     padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem);
 152             }
 153
 154             auto is_not_zero = [](std::size_t i) { return i != 0; };
 155             if(std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) ||
 156                std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero))
 157             {
 158                 /* csl::Convolution supports symmetric padding only; hence, we deal with asymmetric padding by
 159                  * copying the input to a bigger tensor and padding the ends manually
 160                  */
 161                 transformed_shape = input_shape;
 162                 for (int i = 0; i < rank; i++)
 163                     transformed_shape[i] += padding_left[i] + padding_right[i];
 164
 165                 inputTransformer = csl::TensorTransform<T>(cudnnHandle, padding_left, padding_right);
 166             }
 167
 168             typename csl::Convolution<T>::params_type params;
 169             if (transformed_shape.empty())
 170             {
 171                 params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
 172             }
 173             else
 174             {
 175                 /* the convolution operation will be seeing the transformed input */
 176                 params.input_shape.assign(std::begin(transformed_shape), std::end(transformed_shape));
 177             }
 178
 179             auto& fshape = params.filter_shape;
 180             fshape.resize(rank);
 181             fshape[0] = output_feature_maps;
 182             fshape[1] = input_feature_maps_per_group;
 183             std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2);
 184             CV_Assert(fshape.size() == kernel_size.size() + 2);
 185
 186             params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
 187             params.stride = strides;
 188             params.dilation = dilations;
 189             params.groups = config.groups;
 190
 191             convoluter = csl::Convolution<T>(cudnnHandle, params);
 192
 193             csl::WorkspaceBuilder builder;
 194             if (!transformed_shape.empty()) {
 195                 auto& shape = transformed_shape;
 196                 auto sz = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<std::size_t>());
 197                 builder.require<T>(sz);
 198             }
 199             builder.require(convoluter.get_workspace_size());
 200             scratch_mem_in_bytes = builder.required_workspace_size();
 201         }
 202
 203         void forward(
 204             const std::vector<cv::Ptr<BackendWrapper>>& inputs,
 205             const std::vector<cv::Ptr<BackendWrapper>>& outputs,
 206             csl::Workspace& workspace) override
 207         {
 208             CV_Assert(inputs.size() == 1 && outputs.size() == 1);
 209
 210             csl::WorkspaceAllocator allocator(workspace);
 211
 212             auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
 213             auto input = input_wrapper->getView();
 214
 215             if (!transformed_shape.empty())
 216             {
 217                 auto& shape = transformed_shape;
 218                 auto transformed_input = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
 219                 inputTransformer.transform(input, transformed_input);
 220                 input = transformed_input;
 221             }
 222
 223             auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
 224             auto output = output_wrapper->getSpan();
 225
 226             convoluter.convolve(output, input, filtersTensor, allocator.get_instance());
 227             if (!biasTensor.empty())
 228             {
 229                 std::size_t inner_size = output.size_range(2, output.rank());
 230                 kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
 231             }
 232         }
 233
 234         std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
 235
 236     private:
 237         csl::Stream stream;
 238         csl::cudnn::Handle cudnnHandle;
 239         csl::Tensor<T> filtersTensor, biasTensor;
 240         csl::Convolution<T> convoluter;
 241
 242         std::vector<std::size_t> transformed_shape;
 243         csl::TensorTransform<T> inputTransformer;
 244
 245         std::size_t scratch_mem_in_bytes;
 246     };
 247
 248 }}} /* namespace cv::dnn::cuda4dnn */
 249
 250 #endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP */