1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include <cstddef>

#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
15 using namespace cv::dnn::cuda4dnn::csl;
16 using namespace cv::dnn::cuda4dnn::csl::device;
18 namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
    /* Kernel: stores `value` into `output`, N elements at a time.
     *
     * T      element type of the span
     * N      number of elements packed into one vector store
     * output destination span; it is reinterpreted as an array of N-wide vectors
     * value  scalar that is replicated into every lane of the vector
     *
     * The loop runs over `output.size() / vector_type::size()` vector slots, so any
     * trailing elements that do not fill a whole vector are not written by this kernel.
     * The host-side launcher asserts `is_fully_aligned<T>(output, N)` before launching;
     * NOTE(review): presumably that check covers the element count as well as the
     * pointer alignment - confirm against its definition.
     */
    template <class T, std::size_t N>
    __global__ void fill_vec(Span<T> output, T value) {
        using vector_type = get_vector_type_t<T, N>;

        auto output_vPtr = vector_type::get_pointer(output.data());
        for (auto i : grid_stride_range(output.size() / vector_type::size())) {
            /* build the vector locally, then write it out with a single v_store */
            vector_type vec;
            /* NOTE(review): assumes vector_type exposes its lanes as `data[]`
             * (declared in vector_traits.hpp) - confirm */
            for (int j = 0; j < vector_type::size(); j++)
                vec.data[j] = value;
            v_store(output_vPtr[i], vec);
        }
    }
}
/* Launches raw::fill_vec<T, N> on `stream` to write `value` into `output`.
 *
 * N is the vector width (elements per store) the kernel is instantiated with.
 * The span must satisfy is_fully_aligned<T>(output, N); this is enforced with
 * CV_Assert before anything is launched.
 *
 * The execution policy is built for `output.size() / N` work items, i.e. one per
 * N-wide vector, with 0 passed as the third make_policy argument.
 */
template <class T, std::size_t N>
void launch_vectorized_fill(const Stream& stream, Span<T> output, T value) {
    CV_Assert(is_fully_aligned<T>(output, N));

    auto kernel = raw::fill_vec<T, N>;
    auto policy = make_policy(kernel, output.size() / N, 0, stream);
    launch_kernel(kernel, policy, output, value);
}
/* Fills every element of `output` with `value`, using work enqueued on `stream`.
 *
 * The widest vector width the span qualifies for is chosen, tried in the order
 * 4, 2, 1. Exactly one launch is issued per call: the branches are mutually
 * exclusive and the width-1 launch is the fallback when neither wider check passes.
 */
template <class T>
void fill(const Stream& stream, Span<T> output, T value) {
    if (is_fully_aligned<T>(output, 4)) {
        launch_vectorized_fill<T, 4>(stream, output, value);
    } else if (is_fully_aligned<T>(output, 2)) {
        launch_vectorized_fill<T, 2>(stream, output, value);
    } else {
        launch_vectorized_fill<T, 1>(stream, output, value);
    }
}

/* explicit instantiations for the element types provided by this translation unit */
template void fill(const Stream&, Span<__half>, __half);
template void fill(const Stream&, Span<float>, float);
58 }}}} /* namespace cv::dnn::cuda4dnn::kernels */