From: Yashas Samaga B L <yashas_2010@yahoo.com>
Date: Mon, 9 Dec 2019 19:26:58 +0000 (+0530)
Subject: Merge pull request #16069 from YashasSamaga:cuda4dnn-crop_and_resize
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1^2~27
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3fddd3bf93137b02acab03ca789e64e7c25b213e;p=platform%2Fupstream%2Fopencv.git

Merge pull request #16069 from YashasSamaga:cuda4dnn-crop_and_resize

add CropAndResize layer for CUDA backend

* add CropAndResize layer

* process multiple channels per iteration
---

diff --git a/modules/dnn/src/cuda/crop_and_resize.cu b/modules/dnn/src/cuda/crop_and_resize.cu
new file mode 100644
index 0000000..c7e9510
--- /dev/null
+++ b/modules/dnn/src/cuda/crop_and_resize.cu
@@ -0,0 +1,168 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "math.hpp"
+#include "types.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cuda_runtime.h>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+
+        template <class T, std::size_t CHANNELS_PER_ITER>
+        __global__ void crop_and_resize(
+            Span<T> output, size_type out_height, size_type out_width,
+            View<T> input, size_type in_height, size_type in_width,
+            View<T> boxes,
+            size_type num_channels)
+        {
+            // input [1, num_channels, in_height, in_width]
+            // output [boxes, num_channels, out_height, out_width]
+
+            const auto in_image_size = in_height * in_width;
+            const auto out_image_size = out_height * out_width;
+            const auto out_box_size = num_channels * out_image_size;
+
+            /* we have to compute the output value for every combination of (box, c, y, x) in the output
+             *
+             * the computation involving (y, x) are identical for all non-spatial dimensions
+             * the computation and memory requests involving the box are identical for remaining three axes
+             *
+             * we process multiple channels every iteration to reuse the identical computation
+             * and memory requests involved with the box and spatial dimensions
+             */
+
+            /*
+             * if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
+             * (num_channels / CHANNELS_PER_ITER) iterations per (box, x, y)
+             */
+            auto num_channel_iters_per_box_xy = num_channels / CHANNELS_PER_ITER;
+
+            /* we need `num_channel_iters_per_box_xy` iterations per (box, x, y) and there are
+             * `num_boxes` boxes and `out_image_size` combinations of (x, y)
+             */
+            auto num_boxes = boxes.size() / 7; /* 7 values per box */
+            auto iters_per_box = num_channel_iters_per_box_xy * out_image_size;
+            auto iters_required = num_boxes * iters_per_box;
+
+            for (auto iter : grid_stride_range(iters_required)) {
+                const index_type box_no = iter / iters_per_box;
+                const index_type c_start = ((iter % iters_per_box) / out_image_size) * CHANNELS_PER_ITER;
+
+                /* note here that consecutive `iter` values will often have consecutive `x` values
+                 * => stores into output will be coalesced across threads
+                 */
+                const index_type y = (iter % out_image_size) / out_width;
+                const index_type x = iter % out_width;
+
+                const index_type box_offset = box_no * 7;
+                const auto left = boxes[box_offset + 3],
+                           top = boxes[box_offset + 4],
+                           right = boxes[box_offset + 5],
+                           bottom = boxes[box_offset + 6];
+
+                const auto box_width = right - left;
+                const auto box_height = bottom - top;
+
+                const auto o2i_fy = static_cast<T>(in_height - 1) / static_cast<T>(out_height - 1);
+                const auto o2i_fx = static_cast<T>(in_width - 1) / static_cast<T>(out_width - 1);
+
+                const auto height_scale = box_height * o2i_fy;
+                const auto width_scale = box_width * o2i_fx;
+
+                const auto in_y = top * static_cast<T>(in_height - 1) + static_cast<T>(y) * height_scale;
+                const auto in_x = left * static_cast<T>(in_width - 1) + static_cast<T>(x) * width_scale;
+
+                const auto in_y0 = static_cast<index_type>(in_y);
+                const auto in_x0 = static_cast<index_type>(in_x);
+
+                using device::min;
+                const auto in_x1 = min<index_type>(in_x0 + 1, in_width - 1);
+                const auto in_y1 = min<index_type>(in_y0 + 1, in_height - 1);
+
+                index_type in_offset_r0 = c_start * in_image_size + in_y0 * in_width;
+                index_type in_offset_r1 = c_start * in_image_size + in_y1 * in_width;
+                index_type out_idx = box_no * out_box_size + c_start * out_image_size + y * out_width + x;
+
+                #pragma unroll 1 /* disable unrolling */
+                for (int i = 0; i < CHANNELS_PER_ITER; i++) {
+                    auto v_00 = input[in_offset_r0 + in_x0],
+                         v_01 = input[in_offset_r0 + in_x1],
+                         v_10 = input[in_offset_r1 + in_x0],
+                         v_11 = input[in_offset_r1 + in_x1];
+
+                    output[out_idx] =
+                        v_00 +
+                        T(in_y - T(in_y0)) * T(v_10 - v_00) +
+                        T(in_x - T(in_x0)) * T(v_01 - v_00) +
+                        T(in_y - T(in_y0)) * T(in_x - T(in_x0)) * T(v_11 - v_01 - v_10 + v_00);
+
+                    in_offset_r0 += in_image_size;
+                    in_offset_r1 += in_image_size;
+                    out_idx += out_image_size;
+                }
+            }
+        }
+    }
+
+    template <class T, std::size_t CHANNELS_PER_ITER> static
+    void launch_multichannel_crop_and_resize(const Stream& stream,
+            Span<T> output, size_type out_height, size_type out_width,
+            View<T> input, size_type in_height, size_type in_width,
+            View<T> boxes, size_type num_channels)
+    {
+        auto kernel = raw::crop_and_resize<T, CHANNELS_PER_ITER>;
+        auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
+        launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+    }
+
+    template <class T>
+    void crop_and_resize(const Stream& stream, TensorSpan<T> output, TensorView<T> input, View<T> boxes) {
+        CV_Assert(input.get_axis_size(0) == 1); /* batch not supported */
+        CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
+
+        auto out_height = output.get_axis_size(-2);
+        auto out_width = output.get_axis_size(-1);
+
+        auto in_height = input.get_axis_size(-2);
+        auto in_width = input.get_axis_size(-1);
+
+        auto num_channels = input.get_axis_size(1);
+
+        if (num_channels % 64 == 0) {
+            launch_multichannel_crop_and_resize<T, 64>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+        } else if (num_channels % 32 == 0) {
+            launch_multichannel_crop_and_resize<T, 32>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+        } else if (num_channels % 16 == 0) {
+            launch_multichannel_crop_and_resize<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+        } else if (num_channels % 8 == 0) {
+            launch_multichannel_crop_and_resize<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+        } else if (num_channels % 4 == 0) {
+            launch_multichannel_crop_and_resize<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+        } else if (num_channels % 2 == 0) {
+            launch_multichannel_crop_and_resize<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+        } else {
+            launch_multichannel_crop_and_resize<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+        }
+    }
+
+    template void crop_and_resize<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, View<__half> boxes);
+    template void crop_and_resize<float>(const Stream&, TensorSpan<float>, TensorView<float>, View<float> boxes);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
diff --git a/modules/dnn/src/cuda4dnn/kernels/crop_and_resize.hpp b/modules/dnn/src/cuda4dnn/kernels/crop_and_resize.hpp
new file mode 100644
index 0000000..046223d
--- /dev/null
+++ b/modules/dnn/src/cuda4dnn/kernels/crop_and_resize.hpp
@@ -0,0 +1,19 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/span.hpp"
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    template <class T>
+    void crop_and_resize(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::View<T> boxes);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP */
diff --git a/modules/dnn/src/cuda4dnn/kernels/resize.hpp b/modules/dnn/src/cuda4dnn/kernels/resize.hpp
index 5c5cc3d..31aee3d 100644
--- a/modules/dnn/src/cuda4dnn/kernels/resize.hpp
+++ b/modules/dnn/src/cuda4dnn/kernels/resize.hpp
@@ -8,8 +8,6 @@
 #include "../csl/stream.hpp"
 #include "../csl/tensor.hpp"
 
-#include <cstddef>
-
 namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
 
     template <class T>
diff --git a/modules/dnn/src/cuda4dnn/primitives/crop_and_resize.hpp b/modules/dnn/src/cuda4dnn/primitives/crop_and_resize.hpp
new file mode 100644
index 0000000..f0cfcc7
--- /dev/null
+++ b/modules/dnn/src/cuda4dnn/primitives/crop_and_resize.hpp
@@ -0,0 +1,51 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+#include "../kernels/crop_and_resize.hpp"
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    template <class T>
+    class CropAndResizeOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        CropAndResizeOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 2 && outputs.size() == 1);
+
+            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto input = input_wrapper->getView();
+
+            auto box_wrapper = inputs[1].dynamicCast<wrapper_type>();
+            auto boxes = box_wrapper->getView();
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+
+            kernels::crop_and_resize(stream, output, input, static_cast<csl::View<T>>(boxes));
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP */
diff --git a/modules/dnn/src/layers/crop_and_resize_layer.cpp b/modules/dnn/src/layers/crop_and_resize_layer.cpp
index de87107..ab242e1 100644
--- a/modules/dnn/src/layers/crop_and_resize_layer.cpp
+++ b/modules/dnn/src/layers/crop_and_resize_layer.cpp
@@ -7,6 +7,11 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/crop_and_resize.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
 namespace cv { namespace dnn {
 
 class CropAndResizeLayerImpl CV_FINAL : public CropAndResizeLayer
@@ -36,6 +41,11 @@ public:
         return false;
     }
 
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
+    }
+
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
     {
         CV_TRACE_FUNCTION();
@@ -111,6 +121,18 @@ public:
         }
     }
 
+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(
+        void *context_,
+        const std::vector<Ptr<BackendWrapper>>& inputs,
+        const std::vector<Ptr<BackendWrapper>>& outputs
+    ) override
+    {
+        auto context = reinterpret_cast<csl::CSLContext*>(context_);
+        return make_cuda_node<cuda4dnn::CropAndResizeOp>(preferableTarget, std::move(context->stream));
+    }
+#endif
+
 private:
     int outWidth, outHeight;
 };