modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 #ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
   6 #define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
   7
   8 #include "cudnn.hpp"
   9
  10 #include "../pointer.hpp"
  11 #include "../workspace.hpp"
  12
  13 #include <cudnn.h>
  14
  15 #include <cstddef>
  16 #include <array>
  17 #include <algorithm>
  18 #include <vector>
  19 #include <type_traits>
  20 #include <iterator>
  21
  22 namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
  23
  24     /** describe convolution filters
  25      *
  26      * @tparam  T   type of elements in the kernels
  27      */
  28     template <class T>
  29     class FilterDescriptor {
  30     public:
  31         FilterDescriptor() noexcept : descriptor{ nullptr } { }
  32         FilterDescriptor(const FilterDescriptor&) = delete;
  33         FilterDescriptor(FilterDescriptor&& other) noexcept
  34             : descriptor{ other.descriptor } {
  35             other.descriptor = nullptr;
  36         }
  37
  38         /** constructs a filter descriptor from the filter dimensions provided in \p shape
  39          *
  40          * Shape dimensions:
  41          * 0: number of filters
  42          * 1: number of input feature maps
  43          * 2..n: kernel dimensions
  44          *
  45          * Exception Guarantee: Strong
  46          */
  47         template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
  48         FilterDescriptor(const SequenceContainer& shape) {
  49             constructor(shape.begin(), shape.end());
  50         }
  51
  52         /** constructs a filter descriptor from the filter dimensions provided in [begin, end)
  53          *
  54          * Shape dimensions:
  55          * 0: number of filters
  56          * 1: number of input feature maps
  57          * 2..n: kernel dimensions
  58          *
  59          * Exception Guarantee: Strong
  60          */
  61         template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
  62         FilterDescriptor(ForwardItr begin, ForwardItr end) {
  63             constructor(begin, end);
  64         }
  65
  66         /** constructs a filter descriptor from the filter dimensions provided as arguments
  67          *
  68          * Shape dimensions:
  69          * 0: number of filters
  70          * 1: number of input feature maps
  71          * 2..n: kernel dimensions
  72          *
  73          * Exception Guarantee: Strong
  74          */
  75         template <class ...Sizes>
  76         FilterDescriptor(Sizes ...sizes) {
  77             static_assert(sizeof...(Sizes) >= 3, "filter descriptors must have at least three dimensions");
  78             static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank");
  79             std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... };
  80             constructor(std::begin(dims), std::end(dims));
  81         }
  82
  83         ~FilterDescriptor() noexcept {
  84             if (descriptor != nullptr) {
  85                 /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
  86                 CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
  87             }
  88         }
  89
  90         FilterDescriptor& operator=(const FilterDescriptor&) = delete;
  91         FilterDescriptor& operator=(FilterDescriptor&& other) noexcept {
  92             descriptor = other.descriptor;
  93             other.descriptor = nullptr;
  94             return *this;
  95         };
  96
  97         cudnnFilterDescriptor_t get() const noexcept { return descriptor; }
  98
  99     private:
 100         template <class ForwardItr>
 101         void constructor(ForwardItr start, ForwardItr end) {
 102             CV_Assert(start != end);
 103             CV_Assert(std::distance(start, end) >= 3);
 104             CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX);
 105
 106             CUDA4DNN_CHECK_CUDNN(cudnnCreateFilterDescriptor(&descriptor));
 107             try {
 108                 const auto rank = std::distance(start, end);
 109                 if (rank == 4) {
 110                     std::array<int, 4> dims;
 111                     std::copy(start, end, std::begin(dims));
 112                     CUDA4DNN_CHECK_CUDNN(
 113                         cudnnSetFilter4dDescriptor(
 114                             descriptor,
 115                             detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
 116                             dims[0], dims[1], dims[2], dims[3]
 117                         )
 118                     );
 119                 } else {
 120                     std::vector<int> dims(start, end);
 121                     CUDA4DNN_CHECK_CUDNN(
 122                         cudnnSetFilterNdDescriptor(
 123                             descriptor,
 124                             detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
 125                             dims.size(), dims.data()
 126                         )
 127                     );
 128                 }
 129             } catch (...) {
 130                 /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
 131                 CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
 132                 throw;
 133             }
 134         }
 135
 136         cudnnFilterDescriptor_t descriptor;
 137     };
 138
 139     /** describes a convolution operation
 140      *
 141      * @tparam  T   type of element participating in convolution
 142      */
 143     template <class T>
 144     class ConvolutionDescriptor {
 145     public:
 146         ConvolutionDescriptor() noexcept : descriptor{ nullptr } { }
 147         ConvolutionDescriptor(const ConvolutionDescriptor&) = delete;
 148         ConvolutionDescriptor(ConvolutionDescriptor&& other) noexcept
 149             : descriptor{ other.descriptor } {
 150             other.descriptor = nullptr;
 151         }
 152
 153         /** constructs a convolution descriptor
 154          *
 155          * Pre-conditions:
 156          * - \p zero_padding, \p stride and \p dilation must have the same size
 157          *
 158          * The length of the containers is interpreted as the order of the convolution.
 159          *
 160          * Exception Guarantee: Strong
 161          */
 162         template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
 163         ConvolutionDescriptor(
 164             const SequenceContainer& zero_padding,
 165             const SequenceContainer& stride,
 166             const SequenceContainer& dilation,
 167             std::size_t group_count)
 168         {
 169             constructor(zero_padding, stride, dilation, group_count);
 170         }
 171
 172         ~ConvolutionDescriptor() noexcept {
 173             if (descriptor != nullptr) {
 174                 /* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
 175                 CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
 176             }
 177         }
 178
 179         ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete;
 180         ConvolutionDescriptor& operator=(ConvolutionDescriptor&& other) noexcept {
 181             descriptor = other.descriptor;
 182             other.descriptor = nullptr;
 183             return *this;
 184         };
 185
 186         cudnnConvolutionDescriptor_t get() const noexcept { return descriptor; }
 187
 188     private:
 189         template <class SequenceContainer>
 190         void constructor(
 191             const SequenceContainer& zero_padding,
 192             const SequenceContainer& stride,
 193             const SequenceContainer& dilation,
 194             std::size_t group_count)
 195         {
 196             CV_Assert(zero_padding.size() == stride.size());
 197             CV_Assert(zero_padding.size() == dilation.size());
 198
 199             CUDA4DNN_CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&descriptor));
 200             try {
 201                 const auto rank = zero_padding.size();
 202                 if (rank == 2) {
 203                     CUDA4DNN_CHECK_CUDNN(
 204                         cudnnSetConvolution2dDescriptor(
 205                             descriptor,
 206                             zero_padding[0], zero_padding[1],
 207                             stride[0], stride[1],
 208                             dilation[0], dilation[1],
 209                             CUDNN_CROSS_CORRELATION,
 210                             detail::get_data_type<T>()
 211                         )
 212                     );
 213                 } else {
 214                     std::vector<int> ipadding(std::begin(zero_padding), std::end(zero_padding));
 215                     std::vector<int> istride(std::begin(stride), std::end(stride));
 216                     std::vector<int> idilation(std::begin(dilation), std::end(dilation));
 217                     CUDA4DNN_CHECK_CUDNN(
 218                         cudnnSetConvolutionNdDescriptor(
 219                             descriptor,
 220                             rank, ipadding.data(), istride.data(), idilation.data(),
 221                             CUDNN_CROSS_CORRELATION,
 222                             detail::get_data_type<T>()
 223                         )
 224                     );
 225                 }
 226                 CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count));
 227             } catch (...) {
 228                 /* cudnnDestroyConvolutionDescriptor will not fail for a valid desriptor object */
 229                 CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
 230                 throw;
 231             }
 232         }
 233
 234         cudnnConvolutionDescriptor_t descriptor;
 235     };
 236
 237     /** wrapper around a convolution algorithm
 238      *
 239      * @tparam  T   type of elements being convolved
 240      */
 241     template <class T>
 242     class ConvolutionAlgorithm {
 243     public:
 244         ConvolutionAlgorithm() noexcept : workspace_size{ 0 } { }
 245         ConvolutionAlgorithm(ConvolutionAlgorithm&) = default;
 246         ConvolutionAlgorithm(ConvolutionAlgorithm&&) = default;
 247
 248         /** selects a good algorithm for convolution for given configuration
 249          *
 250          * Exception Guarantee: Strong
 251          */
 252         ConvolutionAlgorithm(
 253             const Handle& handle,
 254             const ConvolutionDescriptor<T>& conv,
 255             const FilterDescriptor<T>& filter,
 256             const TensorDescriptor<T>& input,
 257             const TensorDescriptor<T>& output)
 258         {
 259             CUDA4DNN_CHECK_CUDNN(
 260                 cudnnGetConvolutionForwardAlgorithm(
 261                     handle.get(),
 262                     input.get(), filter.get(), conv.get(), output.get(),
 263                     CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
 264                     0, /* no memory limit */
 265                     &algo
 266                 )
 267             );
 268
 269             CUDA4DNN_CHECK_CUDNN(
 270                 cudnnGetConvolutionForwardWorkspaceSize(
 271                     handle.get(),
 272                     input.get(), filter.get(), conv.get(), output.get(),
 273                     algo, &workspace_size
 274                 )
 275             );
 276         }
 277
 278         ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;
 279         ConvolutionAlgorithm& operator=(ConvolutionAlgorithm&& other) = default;
 280
 281         cudnnConvolutionFwdAlgo_t get() const noexcept { return algo; }
 282
 283         /** number of bytes of workspace memory required by the algorithm */
 284         std::size_t get_workspace_size() const noexcept { return workspace_size; }
 285
 286     private:
 287         cudnnConvolutionFwdAlgo_t algo;
 288         std::size_t workspace_size;
 289     };
 290
 291     /** gives the shape of the output tensor of convolution
 292      *
 293      * Exception Guarantee: Basic
 294      */
 295     template <class T>
 296     void getConvolutionForwardOutputDim(
 297         const ConvolutionDescriptor<T>& convDesc,
 298         const FilterDescriptor<T>& filterDesc,
 299         const TensorDescriptor<T>& inputDesc,
 300         std::vector<int>& output)
 301     {
 302         output.clear();
 303         output.resize(CUDNN_DIM_MAX); /* we use `output` to hold temporaries */
 304
 305         std::vector<int> temp(CUDNN_DIM_MAX);
 306         cudnnDataType_t tempDataType;
 307         CUDA4DNN_CHECK_CUDNN(
 308             cudnnGetTensorNdDescriptor(
 309                 inputDesc.get(),
 310                 CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */
 311                 &tempDataType,
 312                 output.data(),
 313                 temp.data(),
 314                 temp.data()
 315             )
 316         );
 317
 318         const auto rank = output[0];
 319         output.resize(rank);
 320         CUDA4DNN_CHECK_CUDNN(
 321             cudnnGetConvolutionNdForwardOutputDim(
 322                 convDesc.get(), inputDesc.get(), filterDesc.get(), rank, output.data()
 323             )
 324         );
 325     }
 326
 327     /** @brief performs convolution
 328      *
 329      * dstValue = alpha * result + beta * priorDstValue
 330      *
 331      * @tparam          T           convolution element type (must be `half` or `float`)
 332      *
 333      * @param           handle      valid cuDNN Handle
 334      * @param           convDesc    convolution description
 335      * @param           convAlgo    algorithm to use for convolution
 336      * @param           workspace   workspace memory which meets the requirements of \p convAlgo
 337      * @param           filterDesc  filter descriptor
 338      * @param[in]       filterPtr   pointer to device memory containing the filters
 339      * @param           inputDesc   tensor descriptor describing the input
 340      * @param[in]       inputPtr    pointer to input tensor in device memory
 341      * @param           alpha       result scale factor
 342      * @param           beta        previous value scale factor
 343      * @param           outputDesc  tensor descriptor describing the output
 344      * @param[out]      outputPtr   pointer to output tensor in device memory
 345      *
 346      * Exception Guarantee: Basic
 347      */
 348     template <class T>
 349     void convolve(
 350         const Handle& handle,
 351         const ConvolutionDescriptor<T>& convDesc,
 352         const ConvolutionAlgorithm<T>& convAlgo,
 353         WorkspaceInstance workspace,
 354         const FilterDescriptor<T>& filterDesc,
 355         DevicePtr<const T> filterPtr,
 356         const TensorDescriptor<T>& inputDesc,
 357         DevicePtr<const T> inputPtr,
 358         T alpha, T beta,
 359         const TensorDescriptor<T>& outputDesc,
 360         DevicePtr<T> outputPtr)
 361     {
 362         CV_Assert(handle);
 363
 364         CUDA4DNN_CHECK_CUDNN(
 365             cudnnConvolutionForward(
 366                 handle.get(),
 367                 &alpha, inputDesc.get(), inputPtr.get(),
 368                 filterDesc.get(), filterPtr.get(),
 369                 convDesc.get(), convAlgo.get(),
 370                 static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
 371                 &beta, outputDesc.get(), outputPtr.get()
 372             )
 373         );
 374     }
 375
 376     template <> inline
 377     void convolve(
 378         const Handle& handle,
 379         const ConvolutionDescriptor<half>& convDesc,
 380         const ConvolutionAlgorithm<half>& convAlgo,
 381         WorkspaceInstance workspace,
 382         const FilterDescriptor<half>& filterDesc,
 383         DevicePtr<const half> filterPtr,
 384         const TensorDescriptor<half>& inputDesc,
 385         DevicePtr<const half> inputPtr,
 386         half alpha, half beta,
 387         const TensorDescriptor<half>& outputDesc,
 388         DevicePtr<half> outputPtr)
 389     {
 390         CV_Assert(handle);
 391
 392         /* we specalize for fp16 as the scaling factors must be provided as `float` */
 393         float alpha_ = alpha, beta_ = beta;
 394         CUDA4DNN_CHECK_CUDNN(
 395             cudnnConvolutionForward(
 396                 handle.get(),
 397                 &alpha_, inputDesc.get(), inputPtr.get(),
 398                 filterDesc.get(), filterPtr.get(),
 399                 convDesc.get(), convAlgo.get(),
 400                 static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
 401                 &beta_, outputDesc.get(), outputPtr.get()
 402             )
 403         );
 404     }
 405
 406 }}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
 407
 408 #endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */