1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
5 #ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
6 #define OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
10 #include "pointer.hpp"
13 #include "workspace.hpp"
15 #include "cudnn/convolution.hpp"
16 #include "cudnn/pooling.hpp"
17 #include "cudnn/lrn.hpp"
18 #include "cudnn/softmax.hpp"
19 #include "cudnn/transform.hpp"
20 #include "cudnn/transpose_convolution.hpp"
22 #include <opencv2/core.hpp>
29 namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
31 namespace tensor_ops {
/** @brief copies data between tensors
 *
 * Pre-conditions:
 * - \p dest and \p src must have the same shape
 *
 * Exception Guarantee: Basic
 */
40 template <class T> inline
41 void copy(const Stream& stream, TensorSpan<T> dest, TensorView<T> src) {
42 CV_Assert(is_shape_same(dest, src));
43 if (dest.get() != src.get())
44 memcpy(dest.get(), src.get(), dest.size(), stream);
/** @brief performs generalized matrix-multiplication
 *
 * Pre-conditions:
 * - \p A and \p B must meet the mathematical requirements for matrix multiplication
 * - \p result must be large enough to hold the result
 *
 * Exception Guarantee: Basic
 */
55 template <class T> inline
56 void gemm(const cublas::Handle& handle, T beta, TensorSpan<T> result, T alpha, bool transa, TensorView<T> A, bool transb, TensorView<T> B) {
57 /* matrix operations can be performed only on rank two or less tensors */
58 CV_Assert(get_effective_rank(A) <= 2 &&
59 get_effective_rank(B) <= 2 &&
60 get_effective_rank(result) <= 2);
62 /* check dimension requirements for matrix multiplication */
63 if (!transa && !transb) {
64 CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
65 CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-2));
66 CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
67 } else if (!transa && transb) {
68 CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
69 CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-1));
70 CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
71 } else if (transa && !transb) {
72 CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
73 CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-2));
74 CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
76 CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
77 CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-1));
78 CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
81 const auto result_nr = result.get_axis_size(-2);
82 const auto result_nc = result.get_axis_size(-1);
83 const auto common_dim = A.get_axis_size(transa ? -2 : -1);
84 const auto A_nc = A.get_axis_size(-1);
85 const auto B_nc = B.get_axis_size(-1);
87 /* tensors are stored in row-major but cublas::gemm operates on column-major matrices
88 * a row-major matrix when read as column-major matrix gives the transpose of the intended matrix
91 * what cuBLAS sees: C^T = A^TB^T = (BA)^T
93 * By reversing operands, we effectively perform:
94 * C^T = B^TA^T = (AB)^T
98 cublas::gemm<T>(handle,
100 result_nc, result_nr, common_dim,
101 alpha, B.get(), B_nc,
103 beta, result.get(), result_nc);
/** @brief performs softmax (or log-softmax) along a channel axis
 *
 * Pre-conditions:
 * - \p output and \p input must have the same shape
 *
 * Exception Guarantee: Basic
 */
113 template <class T> inline
114 void softmax(const cudnn::Handle& handle, TensorSpan<T> output, TensorView<T> input, int channel_axis, bool log) {
115 CV_Assert(is_shape_same(output, input));
117 channel_axis = clamp_axis(channel_axis, input.rank());
119 std::size_t outer_size = input.size_range(0, channel_axis);
120 auto channel_size = input.get_axis_size(channel_axis);
121 std::size_t inner_size = input.size_range(channel_axis + 1, input.rank());
123 std::array<std::size_t, 4> shape = { outer_size, channel_size, 1, inner_size };
125 using cudnn::TensorDescriptor;
126 auto inputDesc = TensorDescriptor<T>(shape);
127 auto outputDesc = TensorDescriptor<T>(shape);
128 cudnn::softmax(handle, outputDesc, output.get(), inputDesc, input.get(), log);
134 using TensorDescriptor = cudnn::TensorDescriptor<T>;
135 using FilterDescriptor = cudnn::FilterDescriptor<T>;
136 using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
137 using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm<T>;
141 std::vector<std::size_t> input_shape;
142 std::vector<std::size_t> filter_shape;
144 std::vector<std::size_t> padding;
145 std::vector<std::size_t> stride;
146 std::vector<std::size_t> dilation;
151 Convolution() = default;
152 Convolution(const Convolution&) = delete;
153 Convolution(Convolution&&) = default;
154 Convolution(cudnn::Handle handle, const params_type& params) {
155 cudnnHandle = std::move(handle);
157 inputTensorDesc = TensorDescriptor(params.input_shape);
158 filterDesc = FilterDescriptor(params.filter_shape);
159 convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
161 std::vector<int> output_dims;
162 getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims);
163 outputTensorDesc = TensorDescriptor(output_dims);
165 algo = ConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, inputTensorDesc, outputTensorDesc);
168 Convolution& operator=(const Convolution&) = delete;
169 Convolution& operator=(Convolution&&) = default;
171 std::size_t get_workspace_size() const noexcept {
172 return algo.get_workspace_size();
175 void convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
178 convDesc, algo, scratchpad,
179 filterDesc, filters.get(),
180 inputTensorDesc, input.get(),
181 1.0, 0.0, outputTensorDesc, output.get()
186 cudnn::Handle cudnnHandle;
187 TensorDescriptor inputTensorDesc, outputTensorDesc;
188 FilterDescriptor filterDesc;
189 ConvolutionDescriptor convDesc;
190 ConvolutionAlgorithm algo;
194 class TransposeConvolution {
195 using TensorDescriptor = cudnn::TensorDescriptor<T>;
196 using FilterDescriptor = cudnn::FilterDescriptor<T>;
197 using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
198 using TransposeConvolutionAlgorithm = cudnn::TransposeConvolutionAlgorithm<T>;
202 std::vector<std::size_t> input_shape;
203 std::vector<std::size_t> output_shape;
205 std::vector<std::size_t> filter_shape;
207 std::vector<std::size_t> padding;
208 std::vector<std::size_t> stride;
209 std::vector<std::size_t> dilation;
214 TransposeConvolution() = default;
215 TransposeConvolution(const TransposeConvolution&) = delete;
216 TransposeConvolution(TransposeConvolution&&) = default;
217 TransposeConvolution(cudnn::Handle handle, const params_type& params) {
218 cudnnHandle = std::move(handle);
220 filterDesc = FilterDescriptor(params.filter_shape);
221 convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
223 /* input_shape is the output shape for convolution
224 * output_shape is the input shape for convolution
226 convInputTensorDesc = TensorDescriptor(params.output_shape);
228 std::vector<int> conv_output_dims;
229 getConvolutionForwardOutputDim(convDesc, filterDesc, convInputTensorDesc, conv_output_dims);
231 /* the convolution output must be identical to what cuDNN expects */
232 CV_Assert(std::equal(std::begin(conv_output_dims), std::end(conv_output_dims), std::begin(params.input_shape)));
234 convOutputTensorDesc = TensorDescriptor(params.input_shape);
236 algo = TransposeConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, convOutputTensorDesc, convInputTensorDesc);
239 TransposeConvolution& operator=(const TransposeConvolution&) = delete;
240 TransposeConvolution& operator=(TransposeConvolution&&) = default;
242 std::size_t get_workspace_size() const noexcept {
243 return algo.get_workspace_size();
246 void transpose_convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
247 cudnn::transpose_convolve<T>(
249 convDesc, algo, scratchpad,
250 filterDesc, filters.get(),
251 convOutputTensorDesc, input.get(),
252 1.0, 0.0, convInputTensorDesc, output.get()
257 cudnn::Handle cudnnHandle;
258 TensorDescriptor convInputTensorDesc, convOutputTensorDesc;
259 FilterDescriptor filterDesc;
260 ConvolutionDescriptor convDesc;
261 TransposeConvolutionAlgorithm algo;
266 using TensorDescriptor = cudnn::TensorDescriptor<T>;
267 using PoolingDescriptor = cudnn::PoolingDescriptor;
270 using PoolingType = PoolingDescriptor::PoolingType;
273 std::vector<std::size_t> input_shape;
274 std::vector<std::size_t> output_shape;
276 std::vector<std::size_t> window_size;
277 std::vector<std::size_t> padding;
278 std::vector<std::size_t> stride;
284 Pooling(const Pooling&) = delete;
285 Pooling(Pooling&&) = default;
286 Pooling(cudnn::Handle handle, const params_type& params) {
287 cudnnHandle = std::move(handle);
289 inputTensorDesc = TensorDescriptor(params.input_shape);
290 poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type);
292 //std::vector<int> output_dim;
293 //getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim);
294 outputTensorDesc = TensorDescriptor(params.output_shape);
297 Pooling& operator=(const Pooling&) = delete;
298 Pooling& operator=(Pooling&&) = default;
300 void pool(TensorView<T> input, TensorSpan<T> output) {
304 inputTensorDesc, input.get(),
305 1.0, 0.0, outputTensorDesc, output.get()
310 cudnn::Handle cudnnHandle;
311 TensorDescriptor inputTensorDesc, outputTensorDesc;
312 PoolingDescriptor poolingDesc;
317 using LRNDescriptor = cudnn::LRNDescriptor;
318 using TensorDescriptor = cudnn::TensorDescriptor<T>;
321 using LRNType = LRNDescriptor::LRNType;
324 LRN(const LRN&) = delete;
325 LRN(LRN&&) = default;
326 LRN(cudnn::Handle handle, std::size_t local_size, T alpha, T beta, T k, LRNType type) {
327 cudnnHandle = std::move(handle);
328 lrnDesc = LRNDescriptor(local_size, alpha, beta, k, type);
331 LRN& operator=(const LRN&) = delete;
332 LRN& operator=(LRN&&) = default;
334 void normalize(TensorView<T> input, TensorSpan<T> output, WorkspaceInstance workspace) {
335 cudnn::LRNForward<T>(
338 TensorDescriptor(input.shape_as_vector()), input.get(),
339 1.0, 0.0, TensorDescriptor(output.shape_as_vector()), output.get(),
345 cudnn::Handle cudnnHandle;
346 LRNDescriptor lrnDesc;
350 class TensorTransform {
351 using TensorTransformDescriptor = cudnn::TensorTransformDescriptor;
352 using TensorDescriptor = cudnn::TensorDescriptor<T>;
355 TensorTransform() = default;
356 TensorTransform(const TensorTransform&) = delete;
357 TensorTransform(TensorTransform&&) = default;
359 template <class SequenceContainer>
360 TensorTransform(cudnn::Handle handle, const SequenceContainer& paddingLeft, const SequenceContainer& paddingRight) {
361 cudnnHandle = std::move(handle);
362 transDesc = TensorTransformDescriptor(paddingLeft, paddingRight);
365 TensorTransform& operator=(const TensorTransform&) = delete;
366 TensorTransform& operator=(TensorTransform&&) = default;
368 void transform(TensorView<T> input, TensorSpan<T> output) {
372 TensorDescriptor(input.shape_as_vector()), input.get(),
373 TensorDescriptor(output.shape_as_vector()), output.get()
378 cudnn::Handle cudnnHandle;
379 TensorTransformDescriptor transDesc;
382 }}}} /* namespace cv::dnn::cuda4dnn::csl */
384 #endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP */