modules/dnn/src/cuda/eltwise_ops.cu

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 #include <cuda_runtime.h>
   6 #include <cuda_fp16.h>
   7
   8 #include "math.hpp"
   9 #include "grid_stride_range.hpp"
  10 #include "execution.hpp"
  11 #include "vector_traits.hpp"
  12
  13 #include "../cuda4dnn/csl/stream.hpp"
  14 #include "../cuda4dnn/csl/span.hpp"
  15
  16 #include <opencv2/core.hpp>
  17
  18 using namespace cv::dnn::cuda4dnn::csl;
  19 using namespace cv::dnn::cuda4dnn::csl::device;
  20
  21 namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
  22
  23     namespace raw {
  24         template <class T, std::size_t N>
  25         __global__ void eltwise_max_2_vec(Span<T> output, View<T> x, View<T> y) {
  26             using vector_type = get_vector_type_t<T, N>;
  27
  28             auto output_vPtr = vector_type::get_pointer(output.data());
  29             auto x_vPtr = vector_type::get_pointer(x.data());
  30             auto y_vPtr = vector_type::get_pointer(y.data());
  31
  32             for (auto i : grid_stride_range(output.size() / vector_type::size())) {
  33                 vector_type vec_x, vec_y;
  34                 v_load(vec_x, x_vPtr[i]);
  35                 v_load(vec_y, y_vPtr[i]);
  36
  37                 for (int j = 0; j < vector_type::size(); j++) {
  38                     using device::max;
  39                     vec_x.data[j] = max(vec_x.data[j], vec_y.data[j]);
  40                 }
  41
  42                 v_store(output_vPtr[i], vec_x);
  43             }
  44         }
  45
  46         template <class T, std::size_t N>
  47         __global__ void eltwise_sum_2_vec(Span<T> output, View<T> x, View<T> y) {
  48             using vector_type = get_vector_type_t<T, N>;
  49
  50             auto output_vPtr = vector_type::get_pointer(output.data());
  51             auto x_vPtr = vector_type::get_pointer(x.data());
  52             auto y_vPtr = vector_type::get_pointer(y.data());
  53
  54             for (auto i : grid_stride_range(output.size() / vector_type::size())) {
  55                 vector_type vec_x, vec_y;
  56                 v_load(vec_x, x_vPtr[i]);
  57                 v_load(vec_y, y_vPtr[i]);
  58
  59                 for (int j = 0; j < vector_type::size(); j++)
  60                     vec_x.data[j] = vec_x.data[j] + vec_y.data[j];
  61
  62                 v_store(output_vPtr[i], vec_x);
  63             }
  64         }
  65
  66         template <class T, std::size_t N>
  67         __global__ void eltwise_sum_coeff_2_vec(Span<T> output, T coeff_x, View<T> x, T coeff_y, View<T> y) {
  68             using vector_type = get_vector_type_t<T, N>;
  69
  70             auto output_vPtr = vector_type::get_pointer(output.data());
  71             auto x_vPtr = vector_type::get_pointer(x.data());
  72             auto y_vPtr = vector_type::get_pointer(y.data());
  73
  74             for (auto i : grid_stride_range(output.size() / vector_type::size())) {
  75                 vector_type vec_x, vec_y;
  76                 v_load(vec_x, x_vPtr[i]);
  77                 v_load(vec_y, y_vPtr[i]);
  78
  79                 for (int j = 0; j < vector_type::size(); j++)
  80                     vec_x.data[j] = coeff_x * vec_x.data[j] + coeff_y * vec_y.data[j];
  81
  82                 v_store(output_vPtr[i], vec_x);
  83             }
  84         }
  85
  86         template <class T, std::size_t N>
  87         __global__ void eltwise_prod_2_vec(Span<T> output, View<T> x, View<T> y) {
  88             using vector_type = get_vector_type_t<T, N>;
  89
  90             auto output_vPtr = vector_type::get_pointer(output.data());
  91             auto x_vPtr = vector_type::get_pointer(x.data());
  92             auto y_vPtr = vector_type::get_pointer(y.data());
  93
  94             for (auto i : grid_stride_range(output.size() / vector_type::size())) {
  95                 vector_type vec_x, vec_y;
  96                 v_load(vec_x, x_vPtr[i]);
  97                 v_load(vec_y, y_vPtr[i]);
  98
  99                 for (int j = 0; j < vector_type::size(); j++)
 100                     vec_x.data[j] = vec_x.data[j] * vec_y.data[j];
 101
 102                 v_store(output_vPtr[i], vec_x);
 103             }
 104         }
 105     }
 106
 107     template <class T, std::size_t N>
 108     void launch_vectorized_eltwise_max_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
 109         CV_Assert(is_fully_aligned<T>(output, N));
 110         CV_Assert(is_fully_aligned<T>(x, N));
 111         CV_Assert(is_fully_aligned<T>(y, N));
 112
 113         auto kernel = raw::eltwise_max_2_vec<T, N>;
 114         auto policy = make_policy(kernel, output.size() / N, 0, stream);
 115         launch_kernel(kernel, policy, output, x, y);
 116     }
 117
 118     template <class T>
 119     void eltwise_max_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
 120         CV_Assert(x.size() == y.size());
 121         CV_Assert(x.size() == output.size());
 122
 123         if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
 124             launch_vectorized_eltwise_max_2<T, 4>(stream, output, x, y);
 125         } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
 126             launch_vectorized_eltwise_max_2<T, 2>(stream, output, x, y);
 127         } else {
 128             launch_vectorized_eltwise_max_2<T, 1>(stream, output, x, y);
 129         }
 130     }
 131
 132     template void eltwise_max_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
 133     template void eltwise_max_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
 134
 135     template <class T, std::size_t N>
 136     void launch_vectorized_eltwise_sum_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
 137         CV_Assert(is_fully_aligned<T>(output, N));
 138         CV_Assert(is_fully_aligned<T>(x, N));
 139         CV_Assert(is_fully_aligned<T>(y, N));
 140
 141         auto kernel = raw::eltwise_sum_2_vec<T, N>;
 142         auto policy = make_policy(kernel, output.size() / N, 0, stream);
 143         launch_kernel(kernel, policy, output, x, y);
 144     }
 145
 146     template <class T>
 147     void eltwise_sum_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
 148         CV_Assert(x.size() == y.size());
 149         CV_Assert(x.size() == output.size());
 150
 151         if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
 152             launch_vectorized_eltwise_sum_2<T, 4>(stream, output, x, y);
 153         } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
 154             launch_vectorized_eltwise_sum_2<T, 2>(stream, output, x, y);
 155         } else {
 156             launch_vectorized_eltwise_sum_2<T, 1>(stream, output, x, y);
 157         }
 158     }
 159
 160     template void eltwise_sum_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
 161     template void eltwise_sum_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
 162
 163     template <class T, std::size_t N>
 164     void launch_vectorized_eltwise_sum_coeff_2(const Stream& stream, Span<T> output, T coeff_x, View<T> x, T coeff_y, View<T> y) {
 165         CV_Assert(is_fully_aligned<T>(output, N));
 166         CV_Assert(is_fully_aligned<T>(x, N));
 167         CV_Assert(is_fully_aligned<T>(y, N));
 168
 169         auto kernel = raw::eltwise_sum_coeff_2_vec<T, N>;
 170         auto policy = make_policy(kernel, output.size() / N, 0, stream);
 171         launch_kernel(kernel, policy, output, coeff_x, x, coeff_y, y);
 172     }
 173
 174     template <class T>
 175     void eltwise_sum_coeff_2(const Stream& stream, Span<T> output, T coeff_x, View<T> x, T coeff_y, View<T> y) {
 176         CV_Assert(x.size() == y.size());
 177         CV_Assert(x.size() == output.size());
 178
 179         if (static_cast<float>(coeff_x) == 1.0f && static_cast<float>(coeff_y) == 1.0f) {
 180             eltwise_sum_2(stream, output, x, y);
 181             return;
 182         }
 183
 184         if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
 185             launch_vectorized_eltwise_sum_coeff_2<T, 4>(stream, output, coeff_x, x, coeff_y, y);
 186         } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
 187             launch_vectorized_eltwise_sum_coeff_2<T, 2>(stream, output, coeff_x, x, coeff_y, y);
 188         } else {
 189             launch_vectorized_eltwise_sum_coeff_2<T, 1>(stream, output, coeff_x, x, coeff_y, y);
 190         }
 191     }
 192
 193     template void eltwise_sum_coeff_2(const Stream&, Span<__half>, __half, View<__half>, __half, View<__half>);
 194     template void eltwise_sum_coeff_2(const Stream&, Span<float>, float, View<float>, float, View<float>);
 195
 196     template <class T, std::size_t N>
 197     void launch_vectorized_eltwise_prod_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
 198         CV_Assert(is_fully_aligned<T>(output, N));
 199         CV_Assert(is_fully_aligned<T>(x, N));
 200         CV_Assert(is_fully_aligned<T>(y, N));
 201
 202         auto kernel = raw::eltwise_prod_2_vec<T, N>;
 203         auto policy = make_policy(kernel, output.size() / N, 0, stream);
 204         launch_kernel(kernel, policy, output, x, y);
 205     }
 206
 207     template <class T>
 208     void eltwise_prod_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
 209         CV_Assert(x.size() == y.size());
 210         CV_Assert(x.size() == output.size());
 211
 212         if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
 213             launch_vectorized_eltwise_prod_2<T, 4>(stream, output, x, y);
 214         } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
 215             launch_vectorized_eltwise_prod_2<T, 2>(stream, output, x, y);
 216         } else {
 217             launch_vectorized_eltwise_prod_2<T, 1>(stream, output, x, y);
 218         }
 219     }
 220
 221     template void eltwise_prod_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
 222     template void eltwise_prod_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
 223
 224 }}}} /* namespace cv::dnn::cuda4dnn::kernels */