modules/dnn/src/cuda/resize.cu

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 #include <cuda_runtime.h>
   6 #include <cuda_fp16.h>
   7
   8 #include "math.hpp"
   9 #include "types.hpp"
  10 #include "grid_stride_range.hpp"
  11 #include "execution.hpp"
  12
  13 #include "../cuda4dnn/csl/stream.hpp"
  14 #include "../cuda4dnn/csl/tensor.hpp"
  15 #include "../cuda4dnn/csl/span.hpp"
  16
  17 #include <cuda_runtime.h>
  18
  19 using namespace cv::dnn::cuda4dnn::csl;
  20 using namespace cv::dnn::cuda4dnn::csl::device;
  21
  22 namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
  23
  24     namespace raw {
  /* Nearest-neighbour resize kernel.
   *
   * The output and input are treated as collections of 2d images: the last axis is the
   * width, the last but one axis is the height, and the remaining axes together enumerate
   * the images. Every output element is a copy of the input element found by scaling its
   * (y, x) position by in_size / out_size and truncating the result.
   *
   * output, out_height, out_width: destination elements and the extent of one output image
   * input, in_height, in_width: source elements and the extent of one input image
   */
  template <class T>
  __global__ void resize_nn(
      Span<T> output, size_type out_height, size_type out_width,
      View<T> input, size_type in_height, size_type in_width)
  {
      const auto in_image_size = in_height * in_width;
      const auto out_image_size = out_height * out_width;

      /* o2i = output to input */
      const auto o2i_fx = static_cast<float>(in_width) / out_width;
      const auto o2i_fy = static_cast<float>(in_height) / out_height;

      for (auto idx : grid_stride_range(output.size())) {
          /* split the flat output index into (image, row, column) */
          const index_type n = idx / out_image_size;
          const auto pos_in_image = idx % out_image_size;
          const index_type x = pos_in_image % out_width;
          const index_type y = pos_in_image / out_width;

          /* The scale factors and the products below are rounded in single precision, so for
           * very large extents the truncated coordinate of the last row/column can land one
           * past the end of the input image. Clamp to the last valid row/column so that the
           * read always stays inside image `n`.
           */
          using device::min;
          const auto in_x = min<index_type>(static_cast<index_type>(x * o2i_fx), in_width - 1);
          const auto in_y = min<index_type>(static_cast<index_type>(y * o2i_fy), in_height - 1);

          const index_type in_idx = n * in_image_size + in_y * in_width + in_x;
          output[idx] = input[in_idx];
      }
  }
  54
  /* Bilinear resize kernel.
   *
   * The output and input are treated as collections of 2d images: the last axis is the
   * width, the last but one axis is the height, and the remaining axes together enumerate
   * the images.
   *
   * output, out_height, out_width: destination elements and the extent of one output image
   * input, in_height, in_width: source elements and the extent of one input image
   * o2i_fy, o2i_fx: output-to-input scale factors; an output position (y, x) samples the
   *                 input at (y * o2i_fy, x * o2i_fx)
   */
  template <class T>
  __global__ void resize_bilinear(
      Span<T> output, size_type out_height, size_type out_width,
      View<T> input, size_type in_height, size_type in_width,
      float o2i_fy, float o2i_fx)
  {
      const auto in_image_size = in_height * in_width;
      const auto out_image_size = out_height * out_width;

      /* largest source coordinate that still lies inside one input image */
      const auto max_in_x = static_cast<float>(in_width - 1);
      const auto max_in_y = static_cast<float>(in_height - 1);

      for (auto idx : grid_stride_range(output.size())) {
          /* split the flat output index into (image, row, column) */
          const index_type n = idx / out_image_size;
          const auto pos_in_image = idx % out_image_size;
          const index_type x = pos_in_image % out_width;
          const index_type y = pos_in_image / out_width;

          using device::min;

          /* The scale factors are supplied by the caller; a factor that maps an output pixel
           * past the last input row/column would make the top-left sample index out of bounds.
           * Clamping the real-valued coordinate keeps all four samples inside image `n` and
           * reduces the interpolation to the edge value there. Coordinates that were already
           * in range produce exactly the same result as before.
           */
          const auto in_x = min<float>(x * o2i_fx, max_in_x);
          const auto in_y = min<float>(y * o2i_fy, max_in_y);

          /* top-left sample: truncate the source coordinate */
          const auto in_x0 = static_cast<index_type>(in_x);
          const auto in_y0 = static_cast<index_type>(in_y);

          /* bottom-right sample: next column/row, limited to the last one */
          const auto in_x1 = min<index_type>(in_x0 + 1, in_width - 1);
          const auto in_y1 = min<index_type>(in_y0 + 1, in_height - 1);

          const index_type in_offset_r0 = n * in_image_size + in_y0 * in_width;
          const index_type in_offset_r1 = n * in_image_size + in_y1 * in_width;

          const auto v_00 = input[in_offset_r0 + in_x0],
                     v_01 = input[in_offset_r0 + in_x1],
                     v_10 = input[in_offset_r1 + in_x0],
                     v_11 = input[in_offset_r1 + in_x1];

          /* fractional distance from the top-left sample, converted to the element type */
          const T w_y = T(in_y - in_y0);
          const T w_x = T(in_x - in_x0);

          output[idx] =
              v_00 +
              w_y * T(v_10 - v_00) +
              w_x * T(v_01 - v_00) +
              w_y * w_x * T(v_11 - v_01 - v_10 + v_00);
      }
  }
  99     }
 100
 /* Host-side launcher for the nearest-neighbour resize kernel.
  *
  * Reads the height and width from the last two axes of `input` and `output`, builds a
  * launch policy for raw::resize_nn<T> from output.size() and `stream`, and launches the
  * kernel with that policy.
  */
 template <class T>
 void resize_nn(const Stream& stream, TensorSpan<T> output, TensorView<T> input) {
     /* axis -1 is the width, axis -2 is the height */
     auto dst_width = output.get_axis_size(-1);
     auto dst_height = output.get_axis_size(-2);

     auto src_width = input.get_axis_size(-1);
     auto src_height = input.get_axis_size(-2);

     auto nn_kernel = raw::resize_nn<T>;
     auto launch_policy = make_policy(nn_kernel, output.size(), 0, stream);
     launch_kernel(
         nn_kernel, launch_policy,
         output, dst_height, dst_width,
         input, src_height, src_width);
 }

 /* explicit instantiations for the supported element types */
 template void resize_nn<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>);
 template void resize_nn<float>(const Stream&, TensorSpan<float>, TensorView<float>);
 116
 /* Host-side launcher for the bilinear resize kernel.
  *
  * Reads the height and width from the last two axes of `input` and `output`, builds a
  * launch policy for raw::resize_bilinear<T> from output.size() and `stream`, and launches
  * the kernel with that policy.
  *
  * scale_y, scale_x: forwarded unchanged as the kernel's output-to-input scale factors
  *                   (o2i_fy and o2i_fx respectively)
  */
 template <class T>
 void resize_bilinear(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x) {
     /* axis -1 is the width, axis -2 is the height */
     auto dst_width = output.get_axis_size(-1);
     auto dst_height = output.get_axis_size(-2);

     auto src_width = input.get_axis_size(-1);
     auto src_height = input.get_axis_size(-2);

     auto bilinear_kernel = raw::resize_bilinear<T>;
     auto launch_policy = make_policy(bilinear_kernel, output.size(), 0, stream);
     launch_kernel(
         bilinear_kernel, launch_policy,
         output, dst_height, dst_width,
         input, src_height, src_width,
         scale_y, scale_x);
 }

 /* explicit instantiations for the supported element types */
 template void resize_bilinear<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float);
 template void resize_bilinear<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float);
 132
 133 }}}} /* namespace cv::dnn::cuda4dnn::kernels */