modules/dnn/src/cuda/max_unpooling.cu

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 #include <cuda_runtime.h>
   6 #include <cuda_fp16.h>
   7
   8 #include "math.hpp"
   9 #include "array.hpp"
  10 #include "limits.hpp"
  11 #include "types.hpp"
  12 #include "grid_stride_range.hpp"
  13 #include "execution.hpp"
  14
  15 #include "../cuda4dnn/csl/stream.hpp"
  16 #include "../cuda4dnn/csl/tensor.hpp"
  17 #include "../cuda4dnn/csl/span.hpp"
  18
  19 #include "../cuda4dnn/kernels/fill.hpp"
  20
  21 #include <opencv2/core.hpp>
  22
  23 #include <cstddef>
  24 #include <vector>
  25 #include <type_traits>
  26
  27 using namespace cv::dnn::cuda4dnn::csl;
  28 using namespace cv::dnn::cuda4dnn::csl::device;
  29
  30 namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
  31
  32     namespace raw {
  33         template <class T, std::size_t Order,
  34         typename std::enable_if<Order == 2 || Order == 3, bool>::type = true> /* Order has been hardcoded; see code */
  35         __global__ void max_pooling_with_indices(
  36             Span<T> output, Span<T> indices, View<T> input, size_type channels,
  37             array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
  38             array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
  39         {
  40             /* every element in the output is mapped to a window in the input and each thread processes several windows */
  41             for (auto idx : grid_stride_range(output.size())) {
  42                 size_type out_spatial_size = 1;
  43                 array<index_type, Order> window_idx;
  44                 for (int i = Order - 1; i >= 0; i--) {
  45                     window_idx[i] = (idx / out_spatial_size) % out_spatial_dims[i];
  46                     out_spatial_size *= out_spatial_dims[i];
  47                 }
  48
  49                 const index_type n = idx / (out_spatial_size * channels);
  50                 const index_type c = (idx / out_spatial_size) % channels;
  51
  52                 array<index_type, Order> start;
  53                 for(int i = 0; i < Order; i++)
  54                     start[i] = window_idx[i] * strides[i] - padding_left[i];
  55
  56                 array<index_type, Order> end;
  57                 for (int i = 0; i < Order; i++) {
  58                     using device::min;
  59                     end[i] = min<index_type>(start[i] + window_size[i], in_spatial_dims[i]);
  60                 }
  61
  62                 for (int i = 0; i < Order; i++) {
  63                     using device::max;
  64                     start[i] = max(start[i], 0);
  65                 }
  66
  67                 T max_value = numeric_limits<T>::lowest();
  68                 index_type max_idx = -1;
  69
  70                 size_type in_spatial_size = 1;
  71                 for (int i = 0; i < Order; i++)
  72                     in_spatial_size *= in_spatial_dims[i];
  73
  74                 const auto outer_offset =  (n * channels + c) * in_spatial_size;
  75                 if (Order == 2) {
  76                     array<index_type, Order> idx;
  77                     for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
  78                         for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
  79                             index_type offset = 0;
  80                             index_type stride = 1;
  81                             for (int i = Order - 1; i >= 0; i--) {
  82                                 offset += stride * idx[i];
  83                                 stride *= in_spatial_dims[i];
  84                             }
  85
  86                             if (input[outer_offset + offset] > max_value) {
  87                                 max_idx = offset;
  88                                 max_value = input[outer_offset + offset];
  89                             }
  90                         }
  91                     }
  92                 } else if(Order == 3) {
  93                     array<index_type, Order> idx;
  94                     for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
  95                         for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
  96                             for (idx[2] = start[2]; idx[2] != end[2]; idx[2]++) {
  97                                 index_type offset = 0;
  98                                 index_type stride = 1;
  99                                 for (int i = Order - 1; i >= 0; i--) {
 100                                     offset += stride * idx[i];
 101                                     stride *= in_spatial_dims[i];
 102                                 }
 103
 104                                 if (input[outer_offset + offset] > max_value) {
 105                                     max_idx = offset;
 106                                     max_value = input[outer_offset + offset];
 107                                 }
 108                             }
 109                         }
 110                     }
 111                 }
 112
 113                 output[idx] = max_value;
 114                 indices[idx] = max_idx;
 115             }
 116         }
 117
 118         template <class T, std::size_t Order>
 119         __global__ void max_unpooling(
 120             Span<T> output, View<T> input, View<T> indices, size_type channels,
 121             array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
 122             array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
 123         {
 124             /* the output has already been zero filled */
 125             /* Every input value represents a window in the output. The max unpooling operation
 126              * copies the input value to exactly one location in the output window which is given
 127              * by the indices tensor.
 128              */
 129             for (auto idx : grid_stride_range(input.size())) {
 130                 size_type in_spatial_size = 1;
 131                 array<index_type, Order> window_idx;
 132                 for (int i = Order - 1; i >= 0; i--) {
 133                     window_idx[i] = (idx / in_spatial_size) % in_spatial_dims[i];
 134                     in_spatial_size *= in_spatial_dims[i];
 135                 }
 136
 137                 const index_type n = idx / (in_spatial_size * channels);
 138                 const index_type c = (idx / in_spatial_size) % channels;
 139
 140                 array<index_type, Order> start;
 141                 for (int i = 0; i < Order; i++) {
 142                     using device::min;
 143                     using device::max;
 144                     start[i] = max(0, min(window_idx[i] * strides[i] - padding_left[i], out_spatial_dims[i] - 1));
 145                 }
 146
 147                 size_type out_spatial_size = 1;
 148                 for (int i = 0; i < Order; i++)
 149                     out_spatial_size *= out_spatial_dims[i];
 150
 151                 index_type outer_offset = (n * channels + c) * out_spatial_size;
 152                 output[outer_offset + static_cast<index_type>(indices[idx])] = input[idx];
 153             }
 154         }
 155     }
 156
 157     template <class T, std::size_t Order> static
 158     void launch_max_pooling_kernel(
 159         const Stream& stream,
 160         Span<T> output, Span<T> indices, View<T> input, std::size_t channels,
 161         const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
 162         const std::vector<std::size_t>& window_size,
 163         const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
 164     {
 165         CV_Assert(indices.size() == output.size());
 166         CV_Assert(out_spatial_dims.size() == Order);
 167         CV_Assert(in_spatial_dims.size() == Order);
 168         CV_Assert(window_size.size() == Order);
 169         CV_Assert(strides.size() == Order);
 170         CV_Assert(padding_left.size() == Order);
 171
 172         array<size_type, Order> out_spatial_dims_k, in_spatial_dims_k;
 173         out_spatial_dims_k.assign(std::begin(out_spatial_dims), std::end(out_spatial_dims));
 174         in_spatial_dims_k.assign(std::begin(in_spatial_dims), std::end(in_spatial_dims));
 175
 176         array<size_type, Order> window_size_k, strides_k, padding_left_k;
 177         window_size_k.assign(std::begin(window_size), std::end(window_size));
 178         strides_k.assign(std::begin(strides), std::end(strides));
 179         padding_left_k.assign(std::begin(padding_left), std::end(padding_left));
 180
 181         auto kernel = raw::max_pooling_with_indices<T, Order>;
 182         auto policy = make_policy(kernel, output.size(), 0, stream);
 183         launch_kernel(kernel, policy, output, indices, input, channels,
 184             out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k);
 185     }
 186
 187     template <class T>
 188     void max_pooling_with_indices(
 189         const Stream& stream,
 190         TensorSpan<T> output, TensorSpan<T> indices, TensorView<T> input,
 191         const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
 192         const std::vector<std::size_t>& padding_left)
 193     {
 194         CV_Assert(is_shape_same(output, indices));
 195         CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
 196
 197         auto order = window_size.size();
 198         CV_Assert(strides.size() == order);
 199         CV_Assert(padding_left.size() == order);
 200         CV_Assert(output.rank() == order + 2);
 201         CV_Assert(input.rank() == order + 2);
 202
 203         std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
 204         for (int i = 0; i < order; i++) {
 205             in_spatial_dims[i] = input.get_axis_size(2 + i);
 206             out_spatial_dims[i] = output.get_axis_size(2 + i);
 207         }
 208
 209         /* only max_pooling2d and max_pooling3d are supported */
 210         CV_Assert(2 <= order && order <= 3);
 211         std::size_t channels = input.get_axis_size(1);
 212         if (order == 3) {
 213             launch_max_pooling_kernel<T, 3>(stream, output, indices, input, channels,
 214                 out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
 215         } else if (order == 2) {
 216             launch_max_pooling_kernel<T, 2>(stream, output, indices, input, channels,
 217                 out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
 218         }
 219     }
 220
 221     template void max_pooling_with_indices(const Stream&,
 222         TensorSpan<__half>, TensorSpan<__half>, TensorView<__half>,
 223         const std::vector<std::size_t>&, const std::vector<std::size_t>&,
 224         const std::vector<std::size_t>&);
 225
 226     template void max_pooling_with_indices(const Stream&,
 227         TensorSpan<float>, TensorSpan<float>, TensorView<float>,
 228         const std::vector<std::size_t>&, const std::vector<std::size_t>&,
 229         const std::vector<std::size_t>&);
 230
 231     template <class T, std::size_t Order> static
 232     void launch_max_unpooling_kernel(
 233         const Stream& stream,
 234         Span<T> output, View<T> input, View<T> indices, std::size_t channels,
 235         const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
 236         const std::vector<std::size_t>& window_size,
 237         const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
 238     {
 239         CV_Assert(out_spatial_dims.size() == Order);
 240         CV_Assert(in_spatial_dims.size() == Order);
 241         CV_Assert(window_size.size() == Order);
 242         CV_Assert(strides.size() == Order);
 243         CV_Assert(padding_left.size() == Order);
 244         CV_Assert(indices.size() == input.size());
 245
 246         array<size_type, Order> out_spatial_dims_k, in_spatial_dims_k;
 247         out_spatial_dims_k.assign(std::begin(out_spatial_dims), std::end(out_spatial_dims));
 248         in_spatial_dims_k.assign(std::begin(in_spatial_dims), std::end(in_spatial_dims));
 249
 250         array<size_type, Order> window_size_k, strides_k, padding_left_k;
 251         window_size_k.assign(std::begin(window_size), std::end(window_size));
 252         strides_k.assign(std::begin(strides), std::end(strides));
 253         padding_left_k.assign(std::begin(padding_left), std::end(padding_left));
 254
 255         auto kernel = raw::max_unpooling<T, Order>;
 256         auto policy = make_policy(kernel, input.size(), 0, stream);
 257         launch_kernel(kernel, policy, output, input, indices, channels,
 258             out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k);
 259     }
 260
 261     template <class T>
 262     void max_unpooling(
 263         const Stream& stream,
 264         TensorSpan<T> output, TensorView<T> input, TensorView<T> indices,
 265         const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
 266         const std::vector<std::size_t>& padding_left)
 267     {
 268         CV_Assert(is_shape_same(input, indices));
 269         CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
 270
 271         auto order = window_size.size();
 272         CV_Assert(strides.size() == order);
 273         CV_Assert(padding_left.size() == order);
 274         CV_Assert(output.rank() == order + 2);
 275         CV_Assert(input.rank() == order + 2);
 276
 277         std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
 278         for (int i = 0; i < order; i++) {
 279             in_spatial_dims[i] = input.get_axis_size(2 + i);
 280             out_spatial_dims[i] = output.get_axis_size(2 + i);
 281         }
 282
 283         kernels::fill<T>(stream, output, 0.0);
 284
 285         /* only max_unpooling2d and max_unpooling3d are supported */
 286         CV_Assert(2 <= order && order <= 3);
 287         std::size_t channels = input.get_axis_size(1);
 288         if (order == 3) {
 289             launch_max_unpooling_kernel<T, 3>(stream, output, input, indices, channels,
 290                 out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
 291         } else if (order == 2) {
 292             launch_max_unpooling_kernel<T, 2>(stream, output, input, indices, channels,
 293                 out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
 294         }
 295     }
 296
 297     template void max_unpooling(const Stream&,
 298         TensorSpan<__half>, TensorView<__half>, TensorView<__half>,
 299         const std::vector<std::size_t>&, const std::vector<std::size_t>&,
 300         const std::vector<std::size_t>&);
 301
 302     template void max_unpooling(const Stream&,
 303         TensorSpan<float>, TensorView<float>, TensorView<float>,
 304         const std::vector<std::size_t>&, const std::vector<std::size_t>&,
 305         const std::vector<std::size_t>&);
 306
 307 }}}} /* namespace cv::dnn::cuda4dnn::kernels */