modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 #ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
   6 #define OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
   7
   8 #include "stream.hpp"
   9 #include "tensor.hpp"
  10 #include "pointer.hpp"
  11 #include "cublas.hpp"
  12 #include "cudnn.hpp"
  13 #include "workspace.hpp"
  14
  15 #include "cudnn/convolution.hpp"
  16 #include "cudnn/pooling.hpp"
  17 #include "cudnn/lrn.hpp"
  18 #include "cudnn/softmax.hpp"
  19 #include "cudnn/transform.hpp"
  20 #include "cudnn/transpose_convolution.hpp"
  21
  22 #include <opencv2/core.hpp>
  23
  24 #include <cstddef>
  25 #include <array>
  26 #include <vector>
  27 #include <algorithm>
  28
  29 namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
  30
  31     namespace tensor_ops {
  32
  33         /** @brief copies data between tensors
  34          *
  35          * Pre-conditions:
  36          * - \p dest and \p src must have the same shape
  37          *
  38          * Exception Gaurantee: Basic
  39          */
  40         template <class T> inline
  41         void copy(const Stream& stream, TensorSpan<T> dest, TensorView<T> src) {
  42             CV_Assert(is_shape_same(dest, src));
  43             if (dest.get() != src.get())
  44                 memcpy(dest.get(), src.get(), dest.size(), stream);
  45         }
  46
  47         /** @brief performs generalized matrix-multiplication
  48          *
  49          * Pre-conditions:
  50          * - \p A and \p B must meet the mathematical requirements for matrix multiplication
  51          * - \p result must be large enough to hold the result
  52          *
  53          * Exception Gaurantee: Basic
  54          */
  55         template <class T> inline
  56         void gemm(const cublas::Handle& handle, T beta, TensorSpan<T> result, T alpha, bool transa, TensorView<T> A, bool transb, TensorView<T> B) {
  57             /* matrix operations can be performed only on rank two or less tensors */
  58             CV_Assert(get_effective_rank(A) <= 2 &&
  59                 get_effective_rank(B) <= 2 &&
  60                 get_effective_rank(result) <= 2);
  61
  62             /* check dimension requirements for matrix multiplication */
  63             if (!transa && !transb) {
  64                 CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
  65                 CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-2));
  66                 CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
  67             } else if (!transa && transb) {
  68                 CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
  69                 CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-1));
  70                 CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
  71             } else if (transa && !transb) {
  72                 CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
  73                 CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-2));
  74                 CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
  75             } else {
  76                 CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
  77                 CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-1));
  78                 CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
  79             }
  80
  81             const auto result_nr = result.get_axis_size(-2);
  82             const auto result_nc = result.get_axis_size(-1);
  83             const auto common_dim = A.get_axis_size(transa ? -2 : -1);
  84             const auto A_nc = A.get_axis_size(-1);
  85             const auto B_nc = B.get_axis_size(-1);
  86
  87             /* tensors are stored in row-major but cublas::gemm operates on column-major matrices
  88              * a row-major matrix when read as column-major matrix gives the transpose of the intended matrix
  89              *
  90              * Required: C = AB
  91              * what cuBLAS sees: C^T = A^TB^T = (BA)^T
  92              *
  93              * By reversing operands, we effectively perform:
  94              * C^T = B^TA^T = (AB)^T
  95              *
  96              * which gives C = AB
  97              */
  98             cublas::gemm<T>(handle,
  99                 transb, transa,
 100                 result_nc, result_nr, common_dim,
 101                 alpha, B.get(), B_nc,
 102                 A.get(), A_nc,
 103                 beta, result.get(), result_nc);
 104         }
 105
 106         /** @brief performs element-wise addition with broadcasting
 107          *
 108          * Pre-conditions:
 109          * - \p A and \p result must be compatible tensors
 110          *
 111          * Exception Gaurantee: Basic
 112          */
 113         template <class T> inline
 114         void softmax(const cudnn::Handle& handle, TensorSpan<T> output, TensorView<T> input, int channel_axis, bool log) {
 115             CV_Assert(is_shape_same(output, input));
 116
 117             channel_axis = clamp_axis(channel_axis, input.rank());
 118
 119             std::size_t outer_size = input.size_range(0, channel_axis);
 120             auto channel_size = input.get_axis_size(channel_axis);
 121             std::size_t inner_size = input.size_range(channel_axis + 1, input.rank());
 122
 123             std::array<std::size_t, 4> shape = { outer_size, channel_size, 1, inner_size };
 124
 125             using cudnn::TensorDescriptor;
 126             auto inputDesc = TensorDescriptor<T>(shape);
 127             auto outputDesc = TensorDescriptor<T>(shape);
 128             cudnn::softmax(handle, outputDesc, output.get(), inputDesc, input.get(), log);
 129         }
 130     }
 131
 132     template <class T>
 133     class Convolution {
 134         using TensorDescriptor = cudnn::TensorDescriptor<T>;
 135         using FilterDescriptor = cudnn::FilterDescriptor<T>;
 136         using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
 137         using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm<T>;
 138
 139     public:
 140         struct params_type {
 141             std::vector<std::size_t> input_shape;
 142             std::vector<std::size_t> filter_shape;
 143
 144             std::vector<std::size_t> padding;
 145             std::vector<std::size_t> stride;
 146             std::vector<std::size_t> dilation;
 147
 148             std::size_t groups;
 149         };
 150
 151         Convolution() = default;
 152         Convolution(const Convolution&) = delete;
 153         Convolution(Convolution&&) = default;
 154         Convolution(cudnn::Handle handle, const params_type& params) {
 155             cudnnHandle = std::move(handle);
 156
 157             inputTensorDesc = TensorDescriptor(params.input_shape);
 158             filterDesc = FilterDescriptor(params.filter_shape);
 159             convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
 160
 161             std::vector<int> output_dims;
 162             getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims);
 163             outputTensorDesc = TensorDescriptor(output_dims);
 164
 165             algo = ConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, inputTensorDesc, outputTensorDesc);
 166         }
 167
 168         Convolution& operator=(const Convolution&) = delete;
 169         Convolution& operator=(Convolution&&) = default;
 170
 171         std::size_t get_workspace_size() const noexcept {
 172             return algo.get_workspace_size();
 173         }
 174
 175         void convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
 176             cudnn::convolve<T>(
 177                 cudnnHandle,
 178                 convDesc, algo, scratchpad,
 179                 filterDesc, filters.get(),
 180                 inputTensorDesc, input.get(),
 181                 1.0, 0.0, outputTensorDesc, output.get()
 182             );
 183         }
 184
 185     private:
 186         cudnn::Handle cudnnHandle;
 187         TensorDescriptor inputTensorDesc, outputTensorDesc;
 188         FilterDescriptor filterDesc;
 189         ConvolutionDescriptor convDesc;
 190         ConvolutionAlgorithm algo;
 191     };
 192
 193     template <class T>
 194     class TransposeConvolution {
 195         using TensorDescriptor = cudnn::TensorDescriptor<T>;
 196         using FilterDescriptor = cudnn::FilterDescriptor<T>;
 197         using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
 198         using TransposeConvolutionAlgorithm = cudnn::TransposeConvolutionAlgorithm<T>;
 199
 200     public:
 201         struct params_type {
 202             std::vector<std::size_t> input_shape;
 203             std::vector<std::size_t> output_shape;
 204
 205             std::vector<std::size_t> filter_shape;
 206
 207             std::vector<std::size_t> padding;
 208             std::vector<std::size_t> stride;
 209             std::vector<std::size_t> dilation;
 210
 211             std::size_t groups;
 212         };
 213
 214         TransposeConvolution() = default;
 215         TransposeConvolution(const TransposeConvolution&) = delete;
 216         TransposeConvolution(TransposeConvolution&&) = default;
 217         TransposeConvolution(cudnn::Handle handle, const params_type& params) {
 218             cudnnHandle = std::move(handle);
 219
 220             filterDesc = FilterDescriptor(params.filter_shape);
 221             convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
 222
 223             /* input_shape is the output shape for convolution
 224              * output_shape is the input shape for convolution
 225              */
 226             convInputTensorDesc = TensorDescriptor(params.output_shape);
 227
 228             std::vector<int> conv_output_dims;
 229             getConvolutionForwardOutputDim(convDesc, filterDesc, convInputTensorDesc, conv_output_dims);
 230
 231             /* the convolution output must be identical to what cuDNN expects */
 232             CV_Assert(std::equal(std::begin(conv_output_dims), std::end(conv_output_dims), std::begin(params.input_shape)));
 233
 234             convOutputTensorDesc = TensorDescriptor(params.input_shape);
 235
 236             algo = TransposeConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, convOutputTensorDesc, convInputTensorDesc);
 237         }
 238
 239         TransposeConvolution& operator=(const TransposeConvolution&) = delete;
 240         TransposeConvolution& operator=(TransposeConvolution&&) = default;
 241
 242         std::size_t get_workspace_size() const noexcept {
 243             return algo.get_workspace_size();
 244         }
 245
 246         void transpose_convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
 247             cudnn::transpose_convolve<T>(
 248                 cudnnHandle,
 249                 convDesc, algo, scratchpad,
 250                 filterDesc, filters.get(),
 251                 convOutputTensorDesc, input.get(),
 252                 1.0, 0.0, convInputTensorDesc, output.get()
 253             );
 254         }
 255
 256     private:
 257         cudnn::Handle cudnnHandle;
 258         TensorDescriptor convInputTensorDesc, convOutputTensorDesc;
 259         FilterDescriptor filterDesc;
 260         ConvolutionDescriptor convDesc;
 261         TransposeConvolutionAlgorithm algo;
 262     };
 263
 264     template <class T>
 265     class Pooling {
 266         using TensorDescriptor = cudnn::TensorDescriptor<T>;
 267         using PoolingDescriptor = cudnn::PoolingDescriptor;
 268
 269     public:
 270         using PoolingType = PoolingDescriptor::PoolingType;
 271
 272         struct params_type {
 273             std::vector<std::size_t> input_shape;
 274             std::vector<std::size_t> output_shape;
 275
 276             std::vector<std::size_t> window_size;
 277             std::vector<std::size_t> padding;
 278             std::vector<std::size_t> stride;
 279
 280             PoolingType type;
 281         };
 282
 283         Pooling() = default;
 284         Pooling(const Pooling&) = delete;
 285         Pooling(Pooling&&) = default;
 286         Pooling(cudnn::Handle handle, const params_type& params) {
 287             cudnnHandle = std::move(handle);
 288
 289             inputTensorDesc = TensorDescriptor(params.input_shape);
 290             poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type);
 291
 292             //std::vector<int> output_dim;
 293             //getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim);
 294             outputTensorDesc = TensorDescriptor(params.output_shape);
 295         }
 296
 297         Pooling& operator=(const Pooling&) = delete;
 298         Pooling& operator=(Pooling&&) = default;
 299
 300         void pool(TensorView<T> input, TensorSpan<T> output) {
 301             cudnn::pool<T>(
 302                 cudnnHandle,
 303                 poolingDesc,
 304                 inputTensorDesc, input.get(),
 305                 1.0, 0.0, outputTensorDesc, output.get()
 306             );
 307         }
 308
 309     private:
 310         cudnn::Handle cudnnHandle;
 311         TensorDescriptor inputTensorDesc, outputTensorDesc;
 312         PoolingDescriptor poolingDesc;
 313     };
 314
 315     template <class T>
 316     class LRN {
 317         using LRNDescriptor = cudnn::LRNDescriptor;
 318         using TensorDescriptor = cudnn::TensorDescriptor<T>;
 319
 320     public:
 321         using LRNType = LRNDescriptor::LRNType;
 322
 323         LRN() = default;
 324         LRN(const LRN&) = delete;
 325         LRN(LRN&&) = default;
 326         LRN(cudnn::Handle handle, std::size_t local_size, T alpha, T beta, T k, LRNType type) {
 327             cudnnHandle = std::move(handle);
 328             lrnDesc = LRNDescriptor(local_size, alpha, beta, k, type);
 329         }
 330
 331         LRN& operator=(const LRN&) = delete;
 332         LRN& operator=(LRN&&) = default;
 333
 334         void normalize(TensorView<T> input, TensorSpan<T> output, WorkspaceInstance workspace) {
 335             cudnn::LRNForward<T>(
 336                 cudnnHandle,
 337                 lrnDesc,
 338                 TensorDescriptor(input.shape_as_vector()), input.get(),
 339                 1.0, 0.0, TensorDescriptor(output.shape_as_vector()), output.get(),
 340                 workspace
 341             );
 342         }
 343
 344     private:
 345         cudnn::Handle cudnnHandle;
 346         LRNDescriptor lrnDesc;
 347     };
 348
 349     template <class T>
 350     class TensorTransform {
 351         using TensorTransformDescriptor = cudnn::TensorTransformDescriptor;
 352         using TensorDescriptor = cudnn::TensorDescriptor<T>;
 353
 354     public:
 355         TensorTransform() = default;
 356         TensorTransform(const TensorTransform&) = delete;
 357         TensorTransform(TensorTransform&&) = default;
 358
 359         template <class SequenceContainer>
 360         TensorTransform(cudnn::Handle handle, const SequenceContainer& paddingLeft, const SequenceContainer& paddingRight) {
 361             cudnnHandle = std::move(handle);
 362             transDesc = TensorTransformDescriptor(paddingLeft, paddingRight);
 363         }
 364
 365         TensorTransform& operator=(const TensorTransform&) = delete;
 366         TensorTransform& operator=(TensorTransform&&) = default;
 367
 368         void transform(TensorView<T> input, TensorSpan<T> output) {
 369             cudnn::transform<T>(
 370                 cudnnHandle,
 371                 transDesc,
 372                 TensorDescriptor(input.shape_as_vector()), input.get(),
 373                 TensorDescriptor(output.shape_as_vector()), output.get()
 374             );
 375         }
 376
 377     private:
 378         cudnn::Handle cudnnHandle;
 379         TensorTransformDescriptor transDesc;
 380     };
 381
 382 }}}} /* namespace cv::dnn::cuda4dnn::csl */
 383
 384 #endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP */