1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
5 #include <cuda_runtime.h>
9 #include "vector_traits.hpp"
10 #include "grid_stride_range.hpp"
11 #include "execution.hpp"
13 #include "../cuda4dnn/csl/stream.hpp"
14 #include "../cuda4dnn/csl/tensor.hpp"
15 #include "../cuda4dnn/csl/span.hpp"
17 #include <opencv2/core.hpp>
21 using namespace cv::dnn::cuda4dnn::csl;
22 using namespace cv::dnn::cuda4dnn::csl::device;
24 namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
27 template <class T, std::size_t N>
28 __global__ void bias1_vec(Span<T> output, View<T> input, T beta) {
29 using vector_type = get_vector_type_t<T, N>;
31 auto output_vPtr = vector_type::get_pointer(output.data());
32 auto input_vPtr = vector_type::get_pointer(input.data());
34 for (auto i : grid_stride_range(output.size() / vector_type::size())) {
36 v_load(vec, input_vPtr[i]);
37 for (int j = 0; j < vec.size(); j++)
38 vec.data[j] = vec.data[j] + beta;
39 v_store(output_vPtr[i], vec);
43 template <class T, std::size_t N>
44 __global__ void biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> bias) {
45 using vector_type = get_vector_type_t<T, N>;
47 auto output_vPtr = vector_type::get_pointer(output.data());
48 auto input_vPtr = vector_type::get_pointer(input.data());
50 inner_size /= vector_type::size();
51 for (auto i : grid_stride_range(output.size() / vector_type::size())) {
52 const index_type bias_idx = (i / inner_size) % static_cast<size_type>(bias.size());
55 v_load(vec, input_vPtr[i]);
56 for(int j = 0; j < vec.size(); j++)
57 vec.data[j] = vec.data[j] + bias[bias_idx];
58 v_store(output_vPtr[i], vec);
62 template <class T, std::size_t N>
63 __global__ void scale1_vec(Span<T> output, View<T> input, T alpha) {
64 using vector_type = get_vector_type_t<T, N>;
66 auto output_vPtr = vector_type::get_pointer(output.data());
67 auto input_vPtr = vector_type::get_pointer(input.data());
69 for (auto i : grid_stride_range(output.size() / vector_type::size())) {
71 v_load(vec, input_vPtr[i]);
72 for (int j = 0; j < vec.size(); j++)
73 vec.data[j] = vec.data[j] * alpha;
74 v_store(output_vPtr[i], vec);
78 template <class T, std::size_t N>
79 __global__ void scaleN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights)
81 using vector_type = get_vector_type_t<T, N>;
83 auto output_vPtr = vector_type::get_pointer(output.data());
84 auto input_vPtr = vector_type::get_pointer(input.data());
86 inner_size /= vector_type::size();
87 for (auto i : grid_stride_range(output.size() / vector_type::size())) {
88 const index_type scale_idx = (i / inner_size) % static_cast<size_type>(weights.size());
91 v_load(vec, input_vPtr[i]);
92 for (int j = 0; j < vec.size(); j++)
93 vec.data[j] = vec.data[j] * weights[scale_idx];
94 v_store(output_vPtr[i], vec);
98 template <class T, std::size_t N>
99 __global__ void scale1_with_bias1_vec(Span<T> output, View<T> input, T alpha, T beta)
101 using vector_type = get_vector_type_t<T, N>;
103 auto output_vPtr = vector_type::get_pointer(output.data());
104 auto input_vPtr = vector_type::get_pointer(input.data());
106 for (auto i : grid_stride_range(output.size() / vector_type::size())) {
108 v_load(vec, input_vPtr[i]);
109 for (int j = 0; j < vec.size(); j++)
110 vec.data[j] = alpha * vec.data[j] + beta;
111 v_store(output_vPtr[i], vec);
115 template <class T, std::size_t N>
116 __global__ void scaleN_with_biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights, View<T> bias)
118 using vector_type = get_vector_type_t<T, N>;
120 auto output_vPtr = vector_type::get_pointer(output.data());
121 auto input_vPtr = vector_type::get_pointer(input.data());
123 inner_size /= vector_type::size();
124 for (auto i : grid_stride_range(output.size() / vector_type::size())) {
125 const index_type scale_idx = (i / inner_size) % static_cast<size_type>(weights.size());
128 v_load(vec, input_vPtr[i]);
129 for (int j = 0; j < vec.size(); j++)
130 vec.data[j] = vec.data[j] * weights[scale_idx] + bias[scale_idx];
131 v_store(output_vPtr[i], vec);
136 template <class T, std::size_t N> static
137 void launch_bias1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T beta) {
138 CV_Assert(is_fully_aligned<T>(output, N));
139 CV_Assert(is_fully_aligned<T>(input, N));
141 auto kernel = raw::bias1_vec<T, N>;
142 auto policy = make_policy(kernel, output.size() / N, 0, stream);
143 launch_kernel(kernel, policy, output, input, beta);
147 void bias1(const Stream& stream, TensorSpan<T> output, TensorView<T> input, T beta) {
148 CV_Assert(is_shape_same(input, output));
150 if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
151 launch_bias1_vec_kernel<T, 4>(stream, output, input, beta);
152 } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
153 launch_bias1_vec_kernel<T, 2>(stream, output, input, beta);
155 launch_bias1_vec_kernel<T, 1>(stream, output, input, beta);
159 template void bias1<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, __half);
160 template void bias1<float>(const Stream&, TensorSpan<float>, TensorView<float>, float);
162 template <class T, std::size_t N> static
163 void launch_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> bias){
164 CV_Assert(is_fully_aligned<T>(output, N));
165 CV_Assert(is_fully_aligned<T>(input, N));
166 CV_Assert(inner_size % N == 0);
168 auto kernel = raw::biasN_vec<T, N>;
169 auto policy = make_policy(kernel, output.size() / N, 0, stream);
170 launch_kernel(kernel, policy, output, input, inner_size, bias);
175 const Stream& stream,
176 TensorSpan<T> output,
177 TensorView<T> input, std::size_t inner_size,
180 CV_Assert(is_shape_same(input, output));
182 if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
183 launch_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, bias);
184 } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
185 launch_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, bias);
187 launch_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, bias);
191 template void biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>);
192 template void biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
194 template <class T, std::size_t N> static
195 void launch_scale1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T alpha) {
196 CV_Assert(is_fully_aligned<T>(output, N));
197 CV_Assert(is_fully_aligned<T>(input, N));
199 auto kernel = raw::scale1_vec<T, N>;
200 auto policy = make_policy(kernel, output.size() / N, 0, stream);
201 launch_kernel(kernel, policy, output, input, alpha);
205 void scale1(const Stream& stream, TensorSpan<T> output, TensorView<T> input, T alpha) {
206 CV_Assert(is_shape_same(input, output));
208 if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
209 launch_scale1_vec_kernel<T, 4>(stream, output, input, alpha);
210 } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
211 launch_scale1_vec_kernel<T, 2>(stream, output, input, alpha);
213 launch_scale1_vec_kernel<T, 1>(stream, output, input, alpha);
217 template void scale1<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, __half);
218 template void scale1<float>(const Stream&, TensorSpan<float>, TensorView<float>, float);
220 template <class T, std::size_t N> static
221 void launch_scaleN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights) {
222 CV_Assert(is_fully_aligned<T>(output, N));
223 CV_Assert(is_fully_aligned<T>(input, N));
224 CV_Assert(inner_size % N == 0);
226 auto kernel = raw::scaleN_vec<T, N>;
227 auto policy = make_policy(kernel, output.size() / N, 0, stream);
228 launch_kernel(kernel, policy, output, input, inner_size, weights);
233 const Stream& stream,
234 TensorSpan<T> output,
235 TensorView<T> input, std::size_t inner_size,
236 TensorView<T> weights)
238 CV_Assert(is_shape_same(input, output));
240 if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
241 launch_scaleN_vec_kernel<T, 4>(stream, output, input, inner_size, weights);
242 } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
243 launch_scaleN_vec_kernel<T, 2>(stream, output, input, inner_size, weights);
245 launch_scaleN_vec_kernel<T, 1>(stream, output, input, inner_size, weights);
249 template void scaleN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>);
250 template void scaleN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
252 template <class T, std::size_t N> static
253 void launch_scale1_with_bias1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
254 CV_Assert(is_fully_aligned<T>(output, N));
255 CV_Assert(is_fully_aligned<T>(input, N));
257 auto kernel = raw::scale1_with_bias1_vec<T, N>;
258 auto policy = make_policy(kernel, output.size() / N, 0, stream);
259 launch_kernel(kernel, policy, output, input, alpha, beta);
263 void scale1_with_bias1(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
264 CV_Assert(output.size() == input.size());
266 if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
267 launch_scale1_with_bias1_vec_kernel<T, 4>(stream, output, input, alpha, beta);
268 } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
269 launch_scale1_with_bias1_vec_kernel<T, 2>(stream, output, input, alpha, beta);
271 launch_scale1_with_bias1_vec_kernel<T, 1>(stream, output, input, alpha, beta);
275 template void scale1_with_bias1<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
276 template void scale1_with_bias1<float>(const Stream&, Span<float>, View<float>, float, float);
278 template <class T, std::size_t N> static
279 void launch_scaleN_with_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights, View<T> bias) {
280 CV_Assert(is_fully_aligned<T>(output, N));
281 CV_Assert(is_fully_aligned<T>(input, N));
282 CV_Assert(inner_size % N == 0);
284 auto kernel = raw::scaleN_with_biasN_vec<T, N>;
285 auto policy = make_policy(kernel, output.size() / N, 0, stream);
286 launch_kernel(kernel, policy, output, input, inner_size, weights, bias);
290 void scaleN_with_biasN(
291 const Stream& stream,
292 TensorSpan<T> output,
293 TensorView<T> input, std::size_t inner_size,
294 TensorView<T> weights, TensorView<T> bias)
296 CV_Assert(is_shape_same(input, output));
297 CV_Assert(weights.size() == bias.size());
299 if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
300 launch_scaleN_with_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, weights, bias);
301 } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
302 launch_scaleN_with_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, weights, bias);
304 launch_scaleN_with_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, weights, bias);
308 template void scaleN_with_biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>, TensorView<__half>);
309 template void scaleN_with_biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>, TensorView<float>);
311 }}}} /* namespace cv::dnn::cuda4dnn::kernels */