// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP

#include "../../op_cuda.hpp"

#include "../csl/cudnn.hpp"
#include "../csl/tensor.hpp"
#include "../csl/tensor_ops.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <cstdint>
#include <vector>
#include <utility>
#include <algorithm>

namespace cv { namespace dnn { namespace cuda4dnn {

    struct PoolingConfiguration {
        enum class PoolingMode {
            MAX,
            AVERAGE_INCLUDE_PADDING, /* include padding while calculating average */
            AVERAGE_EXCLUDE_PADDING /* exclude padding while calculating average */
        };

        PoolingMode poolMode;

        /* the following vectors must all have the same size (the pooling order) */
        std::vector<std::size_t> window_size;
        std::vector<std::size_t> strides;

        enum class PaddingMode {
            MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */
            VALID, /* no padding is added */
            SAME /* TensorFlow logic is used for same padding */
        };

        PaddingMode padMode;

        /* explicit paddings are used if and only if padMode is set to MANUAL */
        std::vector<std::size_t> pads_begin, pads_end;

        /* the output shape is calculated using the following formula:
         * output_dim = func[(input_dim + padding_left + padding_right - kernel_dim)/stride] + 1
         *
         * the rounding mode decides what is used as `func` (see the worked example below)
         */
        enum class RoundingMode {
            CEIL, /* uses ceil */
            FLOOR /* uses floor */
        };
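
        /* worked example (illustrative numbers, not from the original): input_dim = 10,
         * padding_left = padding_right = 1, kernel_dim = 3 and stride = 2 give the fraction
         * (10 + 1 + 1 - 3)/2 = 4.5; FLOOR yields floor(4.5) + 1 = 5 outputs while CEIL
         * yields ceil(4.5) + 1 = 6 outputs
         */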

        RoundingMode roundMode;

        /* full shape inclusive of the batch and channel axes */
        std::vector<std::size_t> input_shape;
    };

    template <class T>
    class PoolingOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        PoolingOp(csl::cudnn::Handle handle, const PoolingConfiguration& config)
            : cudnnHandle(std::move(handle))
        {
            const auto& window_size = config.window_size;

            const auto pooling_order = window_size.size();
            CV_Assert(pooling_order >= 1);

            const auto& strides = config.strides;
            CV_Assert(pooling_order == strides.size());

            const auto& input_shape = config.input_shape;
            CV_Assert(input_shape.size() == pooling_order + 2);

            if (pooling_order > 3)
                CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D pooling are supported.");

            const auto rank = input_shape.size();
            /* left and right are misleading as the padding is applicable for any number of dimensions
             * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
             *
             * `common_padding` contains the amount of padding that has to be added to both sides
             * `padding_left` and `padding_right` contain the amount of padding that needs to be added
             * to a particular side in addition to the common padding
             */
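            /* for instance (illustrative numbers, not from the original): pads_begin = 1 and
             * pads_end = 3 along an axis decompose into common_padding = 1 (applied to both
             * sides), padding_left = 0 and padding_right = 2
             */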
            std::vector<std::size_t> common_padding(rank, 0);
            std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0);
            if (config.padMode == PoolingConfiguration::PaddingMode::MANUAL)
            {
                const auto& pads_begin = config.pads_begin;
                const auto& pads_end = config.pads_end;

                CV_Assert(pooling_order == pads_begin.size());
                CV_Assert(pooling_order == pads_end.size());

                /* cuDNN rounds down by default; hence, if ceilMode is false, we do nothing
                 * otherwise, we add extra padding towards the end so that the pooling arithmetic yields
                 * the correct output size without having to deal with fancy fractional sizes
                 * (see the worked example below)
                 */
                auto pads_end_modified = pads_end;
                if (config.roundMode == PoolingConfiguration::RoundingMode::CEIL)
                {
                    for (int i = 0; i < window_size.size(); i++) {
                        auto rem = (input_shape[i + 2] + pads_begin[i] + pads_end[i] - window_size[i]) % strides[i];
                        if (rem)
                            pads_end_modified[i] += strides[i] - rem;
                    }
                }
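
                /* worked example (illustrative): input = 6, no padding, window = 3, stride = 2
                 * gives rem = (6 - 3) % 2 = 1, so strides[i] - rem = 1 extra unit is added at
                 * the end; floor arithmetic then matches the ceil-mode output:
                 * (6 + 1 - 3)/2 + 1 = 3
                 */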

                for (int i = 2; i < common_padding.size(); i++)
                {
                    common_padding[i] = std::min(pads_begin[i - 2], pads_end_modified[i - 2]);
                    padding_left[i] = pads_begin[i - 2] - common_padding[i];
                    padding_right[i] = pads_end_modified[i - 2] - common_padding[i];
                }
            }
            else if (config.padMode == PoolingConfiguration::PaddingMode::VALID)
            {
                /* nothing to do as the paddings are already preset to zero */
            }
            else if (config.padMode == PoolingConfiguration::PaddingMode::SAME)
            {
                /* TensorFlow logic:
                 * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
                 *
                 * if total padding is odd, the extra is added towards the end
                 */
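                /* worked example (illustrative): i = 6, k = 3, s = 2 give o = ceil(6/2) = 3 and
                 * total_padding = (3 - 1) * 2 + 3 - 6 = 1; the odd unit goes towards the end:
                 * common_padding = 0, padding_left = 0, padding_right = 1
                 */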
                for (int i = 2; i < rank; i++)
                {
                    const auto j = i - 2; /* filter index */

                    /* `output_dim` is ceil(input_shape[i] / strides[j]) computed with integer arithmetic */
                    const auto output_dim = (input_shape[i] - 1 + strides[j]) / strides[j];
                    const auto required_total_padding =
                        std::max<std::int64_t>(0, (output_dim - 1) * strides[j] + window_size[j] - input_shape[i]);

                    common_padding[i] = required_total_padding / 2;
                    padding_left[i] = 0;
                    padding_right[i] = required_total_padding % 2;
                }
            }

            /* in some scenarios, the extra padding at the end may not change the output at all */
            for (int i = 2; i < rank; i++) {
                const auto j = i - 2; /* filter index */
                const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
                std::int64_t rem = (input_shape[i] + total_padding - window_size[j]) % strides[j];

                /* the output shape doesn't change if we decrease the total padding by at most `rem`
                 * provided that we decrease from the right
                 */
                if (rem && padding_right[i] > 0)
                    padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem);
            }
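
            /* worked example (illustrative): input = 6, window = 2, stride = 2 with
             * padding_right = 1 gives rem = (6 + 1 - 2) % 2 = 1; dropping that unit of
             * right padding leaves the output size unchanged:
             * (6 + 1 - 2)/2 + 1 = 3 and (6 - 2)/2 + 1 = 3
             */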

            auto is_not_zero = [](std::size_t i) { return i != 0; };
            if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) ||
                std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero))
            {
                /* csl::Pooling does not fully support asymmetric padding; hence, we deal with asymmetric padding by
                 * copying the input to a bigger tensor and padding the ends manually
                 *
                 * But we first try to avoid the transformation using cuDNN's flexibility. cuDNN can accept a smaller or
                 * a bigger output shape. This effectively allows us to have arbitrary padding at the right.
                 */
                if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero))
                {
                    /* there is padding on the left and we are forced to transform */
                    auto transformed_input_shape = input_shape;
                    for (int i = 0; i < rank; i++)
                        transformed_input_shape[i] += padding_left[i] + padding_right[i];

                    transformedInput.resize(std::begin(transformed_input_shape), std::end(transformed_input_shape));
                    inputTransformer = csl::TensorTransform<T>(cudnnHandle, padding_left, padding_right);
                }
            }
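
            /* for instance (illustrative): with padding_left = {0, 0, 0, 0} and
             * padding_right = {0, 0, 1, 1}, no transformation is required as cuDNN tolerates
             * the larger requested output shape; with padding_left = {0, 0, 1, 0}, the input
             * must be copied into a tensor that is padded at both ends
             */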

            typename csl::Pooling<T>::params_type params;
            if (transformedInput.empty())
            {
                /* no transform => use original input shape */
                params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
            }
            else
            {
                /* the pooling operation will be seeing the transformed input */
                auto transformed_input_shape = transformedInput.shape_as_vector();
                params.input_shape.assign(std::begin(transformed_input_shape), std::end(transformed_input_shape));
            }

            /* the output shape is computed from the original input shape: `total_padding`
             * already includes `padding_left` and `padding_right`, which are also baked into
             * the transformed input, so using `params.input_shape` here would count them twice
             */
            auto output_shape = input_shape;
            for (int i = 2; i < rank; i++)
            {
                auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
                output_shape[i] = (input_shape[i] + total_padding - window_size[i - 2]) / strides[i - 2] + 1;
            }

            params.output_shape.assign(std::begin(output_shape), std::end(output_shape));
            params.window_size = window_size;
            params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
            params.stride = strides;

            if (config.poolMode == PoolingConfiguration::PoolingMode::MAX)
            {
                params.type = csl::Pooling<T>::PoolingType::MAX;
            }
            else if (config.poolMode == PoolingConfiguration::PoolingMode::AVERAGE_INCLUDE_PADDING)
            {
                params.type = csl::Pooling<T>::PoolingType::AVERAGE_INCLUDE_PADDING;
            }
            else if (config.poolMode == PoolingConfiguration::PoolingMode::AVERAGE_EXCLUDE_PADDING)
            {
                params.type = csl::Pooling<T>::PoolingType::AVERAGE_EXCLUDE_PADDING;
            }

            pooler = csl::Pooling<T>(cudnnHandle, params);
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 1 && outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            if (!transformedInput.empty())
            {
                /* asymmetric padding: pad the input manually before pooling */
                inputTransformer.transform(input, transformedInput);
                input = csl::TensorView<T>(transformedInput);
            }

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            pooler.pool(input, output);
        }

    private:
        csl::cudnn::Handle cudnnHandle;
        csl::Pooling<T> pooler;

        csl::Tensor<T> transformedInput;
        csl::TensorTransform<T> inputTransformer;
    };
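
    /* minimal usage sketch (illustrative; `handle`, `inputs`, `outputs` and `workspace` are
     * assumed to be provided by the CUDA backend, which normally does this wiring itself):
     *
     *     PoolingConfiguration config;
     *     config.poolMode = PoolingConfiguration::PoolingMode::MAX;
     *     config.window_size = {2, 2};
     *     config.strides = {2, 2};
     *     config.padMode = PoolingConfiguration::PaddingMode::VALID;
     *     config.roundMode = PoolingConfiguration::RoundingMode::FLOOR;
     *     config.input_shape = {1, 3, 224, 224}; // NCHW
     *
     *     auto node = cv::makePtr<PoolingOp<float>>(handle, config);
     *     node->forward(inputs, outputs, workspace);
     */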

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP */