Merge pull request #14827 from YashasSamaga:cuda4dnn-csl-low
[platform/upstream/opencv.git] / modules / dnn / src / cuda4dnn / primitives / normalize_bbox.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4
5 #ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP
6 #define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP
7
8 #include "../../op_cuda.hpp"
9
10 #include "../csl/stream.hpp"
11 #include "../csl/span.hpp"
12 #include "../csl/tensor.hpp"
13 #include "../csl/workspace.hpp"
14
15 #include "../kernels/scale_shift.hpp"
16 #include "../kernels/normalize.hpp"
17
18 #include <opencv2/core.hpp>
19
20 #include <cstddef>
21 #include <vector>
22 #include <utility>
23
24 namespace cv { namespace dnn { namespace cuda4dnn {
25
    template <class T>
    struct NormalizeConfiguration {
        /* full shape of the input tensor; used to size the reduction workspace */
        std::vector<std::size_t> input_shape;

        /* axis range across which values are normalized
         *
         * [0, axis_start) = outer range
         * [axis_start, axis_end) = mid range
         * [axis_end, ndims) = inner range
         *
         * for each location in the outer and inner range, all the values in the mid range are
         * normalized together
         */
        std::size_t axis_start, axis_end;

        /* 1 for L1 norm, 2 for L2 norm */
        std::size_t norm;

        /* epsilon to use to avoid division by zero */
        T eps;
    };
47
48     template <class T>
49     class NormalizeOp final : public CUDABackendNode {
50     public:
51         using wrapper_type = GetCUDABackendWrapperType<T>;
52
53         template <class V>
54         NormalizeOp(csl::Stream stream_, const Mat& weights, const NormalizeConfiguration<V>& config)
55             : stream(std::move(stream_)), weight{ 1.0 }
56         {
57             norm_order = config.norm;
58             epsilon = config.eps;
59             axis_start = config.axis_start;
60             axis_end = config.axis_end;
61
62             if (!weights.empty())
63             {
64                 if (weights.total() == 1)
65                 {
66                     CV_Assert(weights.type() == CV_32F);
67                     weight = weights.at<float>(0, 0);
68                 }
69                 else
70                 {
71                     weightsTensor = csl::makeTensorHeader<T>(weights);
72                     csl::copyMatToTensor<T>(weights, weightsTensor, stream);
73                 }
74             }
75
76             std::size_t outer_size = 1;
77             for (int i = 0; i < axis_start; i++)
78                 outer_size *= config.input_shape[i];
79
80             std::size_t inner_size = 1;
81             for (int i = axis_end; i < config.input_shape.size(); i++)
82                 inner_size *= config.input_shape[i];
83
84             csl::WorkspaceBuilder builder;
85             builder.require<T>(outer_size * inner_size);
86             scratch_mem_in_bytes = builder.required_workspace_size();
87         }
88
89         void forward(
90             const std::vector<cv::Ptr<BackendWrapper>>& inputs,
91             const std::vector<cv::Ptr<BackendWrapper>>& outputs,
92             csl::Workspace& workspace) override
93         {
94             CV_Assert(inputs.size() == 1 && outputs.size() == 1);
95
96             auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
97             auto input = input_wrapper->getView();
98
99             auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
100             auto output = output_wrapper->getSpan();
101
102             std::size_t outer_size = input.size_range(0, axis_start);
103             std::size_t mid_size = input.size_range(axis_start, axis_end);
104             std::size_t inner_size = input.size_range(axis_end, input.rank());
105
106             auto ws_allocator = csl::WorkspaceAllocator(workspace);
107             auto scratch = ws_allocator.get_span<T>();
108             kernels::normalize<T>(stream, output, input, outer_size, mid_size, inner_size, norm_order, epsilon, scratch);
109
110             /* there might be a single weight in which case `weight` will be not equal to 1.0
111              * or there might be several weights
112              * or we don't have to scale
113              */
114             if (weight != 1.0)
115             {
116                 kernels::scale1<T>(stream, output, input, weight);
117             }
118             else if (!weightsTensor.empty())
119             {
120                 CV_Assert(weightsTensor.size() != 1); /* constructor should have set up to use `weight` */
121                 CV_Assert(weightsTensor.size() == mid_size);
122                 kernels::scaleN<T>(stream, output, input, inner_size, weightsTensor);
123             }
124         }
125
126         std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
127
128     private:
129         csl::Stream stream;
130         csl::Tensor<T> weightsTensor;
131         T weight; /* if there is only one weight, we use this */
132
133         T epsilon;
134         std::size_t norm_order;
135         std::size_t axis_start, axis_end;
136
137         std::size_t scratch_mem_in_bytes;
138     };
139
140 }}} /* namespace cv::dnn::cuda4dnn */
141
142 #endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP */