Merge pull request #14827 from YashasSamaga:cuda4dnn-csl-low
[platform/upstream/opencv.git] / modules / dnn / src / cuda4dnn / csl / tensor_ops.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4
5 #ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
6 #define OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
7
8 #include "stream.hpp"
9 #include "tensor.hpp"
10 #include "pointer.hpp"
11 #include "cublas.hpp"
12 #include "cudnn.hpp"
13 #include "workspace.hpp"
14
15 #include "cudnn/convolution.hpp"
16 #include "cudnn/pooling.hpp"
17 #include "cudnn/lrn.hpp"
18 #include "cudnn/softmax.hpp"
19 #include "cudnn/transform.hpp"
20 #include "cudnn/transpose_convolution.hpp"
21
22 #include <opencv2/core.hpp>
23
24 #include <cstddef>
25 #include <array>
26 #include <vector>
27 #include <algorithm>
28
29 namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
30
31     namespace tensor_ops {
32
33         /** @brief copies data between tensors
34          *
35          * Pre-conditions:
36          * - \p dest and \p src must have the same shape
37          *
38          * Exception Gaurantee: Basic
39          */
40         template <class T> inline
41         void copy(const Stream& stream, TensorSpan<T> dest, TensorView<T> src) {
42             CV_Assert(is_shape_same(dest, src));
43             if (dest.get() != src.get())
44                 memcpy(dest.get(), src.get(), dest.size(), stream);
45         }
46
47         /** @brief performs generalized matrix-multiplication
48          *
49          * Pre-conditions:
50          * - \p A and \p B must meet the mathematical requirements for matrix multiplication
51          * - \p result must be large enough to hold the result
52          *
53          * Exception Gaurantee: Basic
54          */
55         template <class T> inline
56         void gemm(const cublas::Handle& handle, T beta, TensorSpan<T> result, T alpha, bool transa, TensorView<T> A, bool transb, TensorView<T> B) {
57             /* matrix operations can be performed only on rank two or less tensors */
58             CV_Assert(get_effective_rank(A) <= 2 &&
59                 get_effective_rank(B) <= 2 &&
60                 get_effective_rank(result) <= 2);
61
62             /* check dimension requirements for matrix multiplication */
63             if (!transa && !transb) {
64                 CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
65                 CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-2));
66                 CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
67             } else if (!transa && transb) {
68                 CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
69                 CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-1));
70                 CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
71             } else if (transa && !transb) {
72                 CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
73                 CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-2));
74                 CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
75             } else {
76                 CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
77                 CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-1));
78                 CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
79             }
80
81             const auto result_nr = result.get_axis_size(-2);
82             const auto result_nc = result.get_axis_size(-1);
83             const auto common_dim = A.get_axis_size(transa ? -2 : -1);
84             const auto A_nc = A.get_axis_size(-1);
85             const auto B_nc = B.get_axis_size(-1);
86
87             /* tensors are stored in row-major but cublas::gemm operates on column-major matrices
88              * a row-major matrix when read as column-major matrix gives the transpose of the intended matrix
89              *
90              * Required: C = AB
91              * what cuBLAS sees: C^T = A^TB^T = (BA)^T
92              *
93              * By reversing operands, we effectively perform:
94              * C^T = B^TA^T = (AB)^T
95              *
96              * which gives C = AB
97              */
98             cublas::gemm<T>(handle,
99                 transb, transa,
100                 result_nc, result_nr, common_dim,
101                 alpha, B.get(), B_nc,
102                 A.get(), A_nc,
103                 beta, result.get(), result_nc);
104         }
105
106         /** @brief performs element-wise addition with broadcasting
107          *
108          * Pre-conditions:
109          * - \p A and \p result must be compatible tensors
110          *
111          * Exception Gaurantee: Basic
112          */
113         template <class T> inline
114         void softmax(const cudnn::Handle& handle, TensorSpan<T> output, TensorView<T> input, int channel_axis, bool log) {
115             CV_Assert(is_shape_same(output, input));
116
117             channel_axis = clamp_axis(channel_axis, input.rank());
118
119             std::size_t outer_size = input.size_range(0, channel_axis);
120             auto channel_size = input.get_axis_size(channel_axis);
121             std::size_t inner_size = input.size_range(channel_axis + 1, input.rank());
122
123             std::array<std::size_t, 4> shape = { outer_size, channel_size, 1, inner_size };
124
125             using cudnn::TensorDescriptor;
126             auto inputDesc = TensorDescriptor<T>(shape);
127             auto outputDesc = TensorDescriptor<T>(shape);
128             cudnn::softmax(handle, outputDesc, output.get(), inputDesc, input.get(), log);
129         }
130     }
131
132     template <class T>
133     class Convolution {
134         using TensorDescriptor = cudnn::TensorDescriptor<T>;
135         using FilterDescriptor = cudnn::FilterDescriptor<T>;
136         using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
137         using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm<T>;
138
139     public:
140         struct params_type {
141             std::vector<std::size_t> input_shape;
142             std::vector<std::size_t> filter_shape;
143
144             std::vector<std::size_t> padding;
145             std::vector<std::size_t> stride;
146             std::vector<std::size_t> dilation;
147
148             std::size_t groups;
149         };
150
151         Convolution() = default;
152         Convolution(const Convolution&) = delete;
153         Convolution(Convolution&&) = default;
154         Convolution(cudnn::Handle handle, const params_type& params) {
155             cudnnHandle = std::move(handle);
156
157             inputTensorDesc = TensorDescriptor(params.input_shape);
158             filterDesc = FilterDescriptor(params.filter_shape);
159             convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
160
161             std::vector<int> output_dims;
162             getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims);
163             outputTensorDesc = TensorDescriptor(output_dims);
164
165             algo = ConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, inputTensorDesc, outputTensorDesc);
166         }
167
168         Convolution& operator=(const Convolution&) = delete;
169         Convolution& operator=(Convolution&&) = default;
170
171         std::size_t get_workspace_size() const noexcept {
172             return algo.get_workspace_size();
173         }
174
175         void convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
176             cudnn::convolve<T>(
177                 cudnnHandle,
178                 convDesc, algo, scratchpad,
179                 filterDesc, filters.get(),
180                 inputTensorDesc, input.get(),
181                 1.0, 0.0, outputTensorDesc, output.get()
182             );
183         }
184
185     private:
186         cudnn::Handle cudnnHandle;
187         TensorDescriptor inputTensorDesc, outputTensorDesc;
188         FilterDescriptor filterDesc;
189         ConvolutionDescriptor convDesc;
190         ConvolutionAlgorithm algo;
191     };
192
193     template <class T>
194     class TransposeConvolution {
195         using TensorDescriptor = cudnn::TensorDescriptor<T>;
196         using FilterDescriptor = cudnn::FilterDescriptor<T>;
197         using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
198         using TransposeConvolutionAlgorithm = cudnn::TransposeConvolutionAlgorithm<T>;
199
200     public:
201         struct params_type {
202             std::vector<std::size_t> input_shape;
203             std::vector<std::size_t> output_shape;
204
205             std::vector<std::size_t> filter_shape;
206
207             std::vector<std::size_t> padding;
208             std::vector<std::size_t> stride;
209             std::vector<std::size_t> dilation;
210
211             std::size_t groups;
212         };
213
214         TransposeConvolution() = default;
215         TransposeConvolution(const TransposeConvolution&) = delete;
216         TransposeConvolution(TransposeConvolution&&) = default;
217         TransposeConvolution(cudnn::Handle handle, const params_type& params) {
218             cudnnHandle = std::move(handle);
219
220             filterDesc = FilterDescriptor(params.filter_shape);
221             convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
222
223             /* input_shape is the output shape for convolution
224              * output_shape is the input shape for convolution
225              */
226             convInputTensorDesc = TensorDescriptor(params.output_shape);
227
228             std::vector<int> conv_output_dims;
229             getConvolutionForwardOutputDim(convDesc, filterDesc, convInputTensorDesc, conv_output_dims);
230
231             /* the convolution output must be identical to what cuDNN expects */
232             CV_Assert(std::equal(std::begin(conv_output_dims), std::end(conv_output_dims), std::begin(params.input_shape)));
233
234             convOutputTensorDesc = TensorDescriptor(params.input_shape);
235
236             algo = TransposeConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, convOutputTensorDesc, convInputTensorDesc);
237         }
238
239         TransposeConvolution& operator=(const TransposeConvolution&) = delete;
240         TransposeConvolution& operator=(TransposeConvolution&&) = default;
241
242         std::size_t get_workspace_size() const noexcept {
243             return algo.get_workspace_size();
244         }
245
246         void transpose_convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
247             cudnn::transpose_convolve<T>(
248                 cudnnHandle,
249                 convDesc, algo, scratchpad,
250                 filterDesc, filters.get(),
251                 convOutputTensorDesc, input.get(),
252                 1.0, 0.0, convInputTensorDesc, output.get()
253             );
254         }
255
256     private:
257         cudnn::Handle cudnnHandle;
258         TensorDescriptor convInputTensorDesc, convOutputTensorDesc;
259         FilterDescriptor filterDesc;
260         ConvolutionDescriptor convDesc;
261         TransposeConvolutionAlgorithm algo;
262     };
263
264     template <class T>
265     class Pooling {
266         using TensorDescriptor = cudnn::TensorDescriptor<T>;
267         using PoolingDescriptor = cudnn::PoolingDescriptor;
268
269     public:
270         using PoolingType = PoolingDescriptor::PoolingType;
271
272         struct params_type {
273             std::vector<std::size_t> input_shape;
274             std::vector<std::size_t> output_shape;
275
276             std::vector<std::size_t> window_size;
277             std::vector<std::size_t> padding;
278             std::vector<std::size_t> stride;
279
280             PoolingType type;
281         };
282
283         Pooling() = default;
284         Pooling(const Pooling&) = delete;
285         Pooling(Pooling&&) = default;
286         Pooling(cudnn::Handle handle, const params_type& params) {
287             cudnnHandle = std::move(handle);
288
289             inputTensorDesc = TensorDescriptor(params.input_shape);
290             poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type);
291
292             //std::vector<int> output_dim;
293             //getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim);
294             outputTensorDesc = TensorDescriptor(params.output_shape);
295         }
296
297         Pooling& operator=(const Pooling&) = delete;
298         Pooling& operator=(Pooling&&) = default;
299
300         void pool(TensorView<T> input, TensorSpan<T> output) {
301             cudnn::pool<T>(
302                 cudnnHandle,
303                 poolingDesc,
304                 inputTensorDesc, input.get(),
305                 1.0, 0.0, outputTensorDesc, output.get()
306             );
307         }
308
309     private:
310         cudnn::Handle cudnnHandle;
311         TensorDescriptor inputTensorDesc, outputTensorDesc;
312         PoolingDescriptor poolingDesc;
313     };
314
315     template <class T>
316     class LRN {
317         using LRNDescriptor = cudnn::LRNDescriptor;
318         using TensorDescriptor = cudnn::TensorDescriptor<T>;
319
320     public:
321         using LRNType = LRNDescriptor::LRNType;
322
323         LRN() = default;
324         LRN(const LRN&) = delete;
325         LRN(LRN&&) = default;
326         LRN(cudnn::Handle handle, std::size_t local_size, T alpha, T beta, T k, LRNType type) {
327             cudnnHandle = std::move(handle);
328             lrnDesc = LRNDescriptor(local_size, alpha, beta, k, type);
329         }
330
331         LRN& operator=(const LRN&) = delete;
332         LRN& operator=(LRN&&) = default;
333
334         void normalize(TensorView<T> input, TensorSpan<T> output, WorkspaceInstance workspace) {
335             cudnn::LRNForward<T>(
336                 cudnnHandle,
337                 lrnDesc,
338                 TensorDescriptor(input.shape_as_vector()), input.get(),
339                 1.0, 0.0, TensorDescriptor(output.shape_as_vector()), output.get(),
340                 workspace
341             );
342         }
343
344     private:
345         cudnn::Handle cudnnHandle;
346         LRNDescriptor lrnDesc;
347     };
348
349     template <class T>
350     class TensorTransform {
351         using TensorTransformDescriptor = cudnn::TensorTransformDescriptor;
352         using TensorDescriptor = cudnn::TensorDescriptor<T>;
353
354     public:
355         TensorTransform() = default;
356         TensorTransform(const TensorTransform&) = delete;
357         TensorTransform(TensorTransform&&) = default;
358
359         template <class SequenceContainer>
360         TensorTransform(cudnn::Handle handle, const SequenceContainer& paddingLeft, const SequenceContainer& paddingRight) {
361             cudnnHandle = std::move(handle);
362             transDesc = TensorTransformDescriptor(paddingLeft, paddingRight);
363         }
364
365         TensorTransform& operator=(const TensorTransform&) = delete;
366         TensorTransform& operator=(TensorTransform&&) = default;
367
368         void transform(TensorView<T> input, TensorSpan<T> output) {
369             cudnn::transform<T>(
370                 cudnnHandle,
371                 transDesc,
372                 TensorDescriptor(input.shape_as_vector()), input.get(),
373                 TensorDescriptor(output.shape_as_vector()), output.get()
374             );
375         }
376
377     private:
378         cudnn::Handle cudnnHandle;
379         TensorTransformDescriptor transDesc;
380     };
381
382 }}}} /* namespace cv::dnn::cuda4dnn::csl */
383
384 #endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP */