modules/dnn/src/cuda/fill.cu

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 #include <cuda_runtime.h>
   6 #include <cuda_fp16.h>
   7
   8 #include "grid_stride_range.hpp"
   9 #include "execution.hpp"
  10 #include "vector_traits.hpp"
  11
  12 #include "../cuda4dnn/csl/stream.hpp"
  13 #include "../cuda4dnn/csl/span.hpp"
  14
  15 using namespace cv::dnn::cuda4dnn::csl;
  16 using namespace cv::dnn::cuda4dnn::csl::device;
  17
  18 namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
  19
  20     namespace raw {
  21         template <class T, std::size_t N>
  22         __global__ void fill_vec(Span<T> output, T value) {
  23             using vector_type = get_vector_type_t<T, N>;
  24
  25             auto output_vPtr = vector_type::get_pointer(output.data());
  26             for (auto i : grid_stride_range(output.size() / vector_type::size())) {
  27                 vector_type vec;
  28                 for (int j = 0; j < vector_type::size(); j++)
  29                     vec.data[j] = value;
  30                 v_store(output_vPtr[i], vec);
  31             }
  32         }
  33     }
  34
  35     template <class T, std::size_t N>
  36     void launch_vectorized_fill(const Stream& stream, Span<T> output, T value) {
  37         CV_Assert(is_fully_aligned<T>(output, N));
  38
  39         auto kernel = raw::fill_vec<T, N>;
  40         auto policy = make_policy(kernel, output.size() / N, 0, stream);
  41         launch_kernel(kernel, policy, output, value);
  42     }
  43
  44     template <class T>
  45     void fill(const Stream& stream, Span<T> output, T value) {
  46         if (is_fully_aligned<T>(output, 4)) {
  47             launch_vectorized_fill<T, 4>(stream, output, value);
  48         } else if (is_fully_aligned<T>(output, 2)) {
  49             launch_vectorized_fill<T, 2>(stream, output, value);
  50         } else {
  51             launch_vectorized_fill<T, 1>(stream, output, value);
  52         }
  53     }
  54
  55     template void fill(const Stream&, Span<__half>, __half);
  56     template void fill(const Stream&, Span<float>, float);
  57
  58 }}}} /* namespace cv::dnn::cuda4dnn::kernels */