set(MIN_VER_CMAKE 3.5.1)
endif()
set(MIN_VER_CUDA 6.5)
-set(MIN_VER_CUDNN 6)
+set(MIN_VER_CUDNN 7.5)
set(MIN_VER_PYTHON2 2.7)
set(MIN_VER_PYTHON3 3.2)
set(MIN_VER_ZLIB 1.2.3)
if(OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
list(APPEND include_dirs ${CUDA_TOOLKIT_INCLUDE} ${CUDNN_INCLUDE_DIRS})
+ set(CC_LIST ${CUDA_ARCH_BIN})
+ separate_arguments(CC_LIST)
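+  # e.g. CUDA_ARCH_BIN="5.3 6.1 7.5" passes the check below, while any entry below 5.3 (such as 5.0) aborts the configuration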
+ foreach(cc ${CC_LIST})
+ if(cc VERSION_LESS 5.3)
+      message(FATAL_ERROR "CUDA backend for DNN module requires CC 5.3 or higher. Please remove unsupported architectures from the CUDA_ARCH_BIN option.")
+ endif()
+ endforeach()
+ unset(CC_LIST)
else()
set(sources_options ${sources_options} EXCLUDE_CUDA)
endif()
DNN_BACKEND_HALIDE,
DNN_BACKEND_INFERENCE_ENGINE, //!< Intel's Inference Engine computational backend.
DNN_BACKEND_OPENCV,
- DNN_BACKEND_VKCOM
+ DNN_BACKEND_VKCOM,
+ DNN_BACKEND_CUDA
};
/**
DNN_TARGET_OPENCL_FP16,
DNN_TARGET_MYRIAD,
DNN_TARGET_VULKAN,
- DNN_TARGET_FPGA //!< FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin.
+ DNN_TARGET_FPGA, //!< FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin.
+ DNN_TARGET_CUDA,
+ DNN_TARGET_CUDA_FP16
};
CV_EXPORTS std::vector< std::pair<Backend, Target> > getAvailableBackends();
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> > &inputs);
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs);
+
+ /**
+ * @brief Returns a CUDA backend node
+ *
+     * @param context void pointer to a CSLContext object
+ * @param inputs layer inputs
+ * @param outputs layer outputs
+ */
+ virtual Ptr<BackendNode> initCUDA(
+ void *context,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ );
+
/**
* @brief Automatic Halide scheduling based on layer hyper-parameters.
* @param[in] node Backend node with Halide functions.
* @see Target
*
* List of supported combinations backend / target:
- * | | DNN_BACKEND_OPENCV | DNN_BACKEND_INFERENCE_ENGINE | DNN_BACKEND_HALIDE |
- * |------------------------|--------------------|------------------------------|--------------------|
- * | DNN_TARGET_CPU | + | + | + |
- * | DNN_TARGET_OPENCL | + | + | + |
- * | DNN_TARGET_OPENCL_FP16 | + | + | |
- * | DNN_TARGET_MYRIAD | | + | |
- * | DNN_TARGET_FPGA | | + | |
+ * | | DNN_BACKEND_OPENCV | DNN_BACKEND_INFERENCE_ENGINE | DNN_BACKEND_HALIDE | DNN_BACKEND_CUDA |
+ * |------------------------|--------------------|------------------------------|--------------------|-------------------|
+ * | DNN_TARGET_CPU | + | + | + | |
+ * | DNN_TARGET_OPENCL | + | + | + | |
+ * | DNN_TARGET_OPENCL_FP16 | + | + | | |
+ * | DNN_TARGET_MYRIAD | | + | | |
+ * | DNN_TARGET_FPGA | | + | | |
+ * | DNN_TARGET_CUDA | | | | + |
+ * | DNN_TARGET_CUDA_FP16 | | | | + |
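 * 
 * For example (illustrative usage), to run a network on the CUDA backend:
 * @code
 * net.setPreferableBackend(DNN_BACKEND_CUDA);
 * net.setPreferableTarget(DNN_TARGET_CUDA); // or DNN_TARGET_CUDA_FP16 on supported hardware
 * @endcode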
*/
CV_WRAP void setPreferableTarget(int targetId);
Backend backendId = get<0>(get<1>(GetParam()));
Target targetId = get<1>(get<1>(GetParam()));
- if (targetId != DNN_TARGET_CPU)
- throw SkipTestException("Only CPU is supported");
+    if (targetId != DNN_TARGET_CPU && backendId != DNN_BACKEND_CUDA)
+        throw SkipTestException("Only CPU and CUDA are supported");
int inChannels = inputShape[1];
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "math.hpp"
+#include "types.hpp"
+#include "vector_traits.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include "../cuda4dnn/kernels/scale_shift.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ namespace raw {
+ template <class T, std::size_t N>
+ __global__ void abs_vec(Span<T> output, View<T> input) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for (int j = 0; j < vector_type::size(); j++) {
+ using device::abs;
+ vec.data[j] = abs(vec.data[j]);
+ }
+ v_store(output_vPtr[i], vec);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void tanh_vec(Span<T> output, View<T> input) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for (int j = 0; j < vector_type::size(); j++) {
+ using device::tanh;
+ vec.data[j] = tanh(vec.data[j]);
+ }
+ v_store(output_vPtr[i], vec);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void sigmoid_vec(Span<T> output, View<T> input) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for (int j = 0; j < vector_type::size(); j++) {
+ using device::sigmoid;
+ vec.data[j] = sigmoid(vec.data[j]);
+ }
+ v_store(output_vPtr[i], vec);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void bnll_vec(Span<T> output, View<T> input) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for (int j = 0; j < vector_type::size(); j++) {
+ using device::log1pexp;
+ vec.data[j] = vec.data[j] > T(0) ? vec.data[j] + log1pexp(-vec.data[j]) : log1pexp(vec.data[j]);
+ }
+ v_store(output_vPtr[i], vec);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void elu_vec(Span<T> output, View<T> input) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for (int j = 0; j < vector_type::size(); j++) {
+ using device::expm1;
+ vec.data[j] = vec.data[j] >= T(0) ? vec.data[j] : expm1(vec.data[j]);
+ }
+ v_store(output_vPtr[i], vec);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void relu_vec(Span<T> output, View<T> input, T slope) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for(int j = 0; j < vector_type::size(); j++)
+ vec.data[j] = vec.data[j] >= T(0) ? vec.data[j] : slope * vec.data[j];
+ v_store(output_vPtr[i], vec);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void clipped_relu_vec(Span<T> output, View<T> input, T floor, T ceiling) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ using device::clamp;
+
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for (int j = 0; j < vector_type::size(); j++)
+ vec.data[j] = clamp(vec.data[j], floor, ceiling);
+ v_store(output_vPtr[i], vec);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void axiswise_relu_vec(Span<T> output, View<T> input, size_type inner_size, View<T> slope) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ inner_size /= vector_type::size();
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ const index_type c = (i / inner_size) % static_cast<size_type>(slope.size());
+
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for (int j = 0; j < vector_type::size(); j++)
+ vec.data[j] = vec.data[j] > T(0) ? vec.data[j] : vec.data[j] * slope[c];
+ v_store(output_vPtr[i], vec);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void power_vec(Span<T> output, View<T> input, T exp, T scale, T shift) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ using device::pow;
+
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for (int j = 0; j < vector_type::size(); j++)
+ vec.data[j] = pow(shift + scale * vec.data[j], exp);
+ v_store(output_vPtr[i], vec);
+ }
+ }
+ }
+
+ template <class T, std::size_t N>
+ void launch_vectorized_abs(const Stream& stream, Span<T> output, View<T> input) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+
+ auto kernel = raw::abs_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input);
+ }
+
+ template <class T>
+ void abs(const Stream& stream, Span<T> output, View<T> input) {
+ CV_Assert(input.size() == output.size());
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+ launch_vectorized_abs<T, 4>(stream, output, input);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+ launch_vectorized_abs<T, 2>(stream, output, input);
+ } else {
+ launch_vectorized_abs<T, 1>(stream, output, input);
+ }
+ }
+
+ template void abs<__half>(const Stream& stream, Span<__half> output, View<__half> input);
+ template void abs<float>(const Stream& stream, Span<float> output, View<float> input);
+
+ template <class T, std::size_t N>
+ void launch_vectorized_tanh(const Stream& stream, Span<T> output, View<T> input) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+
+ auto kernel = raw::tanh_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input);
+ }
+
+ template <class T>
+ void tanh(const Stream& stream, Span<T> output, View<T> input) {
+ CV_Assert(input.size() == output.size());
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+ launch_vectorized_tanh<T, 4>(stream, output, input);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+ launch_vectorized_tanh<T, 2>(stream, output, input);
+ } else {
+ launch_vectorized_tanh<T, 1>(stream, output, input);
+ }
+ }
+
+ template void tanh<__half>(const Stream&, Span<__half>, View<__half>);
+ template void tanh<float>(const Stream&, Span<float>, View<float>);
+
+ template <class T, std::size_t N>
+ void launch_vectorized_sigmoid(const Stream& stream, Span<T> output, View<T> input) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+
+ auto kernel = raw::sigmoid_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input);
+ }
+
+ template <class T>
+ void sigmoid(const Stream& stream, Span<T> output, View<T> input) {
+ CV_Assert(input.size() == output.size());
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+ launch_vectorized_sigmoid<T, 4>(stream, output, input);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+ launch_vectorized_sigmoid<T, 2>(stream, output, input);
+ } else {
+ launch_vectorized_sigmoid<T, 1>(stream, output, input);
+ }
+ }
+
+ template void sigmoid<__half>(const Stream&, Span<__half>, View<__half>);
+ template void sigmoid<float>(const Stream&, Span<float>, View<float>);
+
+ template <class T, std::size_t N>
+ void launch_vectorized_bnll(const Stream& stream, Span<T> output, View<T> input) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+
+ auto kernel = raw::bnll_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input);
+ }
+
+ template <class T>
+ void bnll(const Stream& stream, Span<T> output, View<T> input) {
+ CV_Assert(input.size() == output.size());
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+ launch_vectorized_bnll<T, 4>(stream, output, input);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+ launch_vectorized_bnll<T, 2>(stream, output, input);
+ } else {
+ launch_vectorized_bnll<T, 1>(stream, output, input);
+ }
+ }
+
+ template void bnll<__half>(const Stream&, Span<__half>, View<__half>);
+ template void bnll<float>(const Stream&, Span<float>, View<float>);
+
+ template <class T, std::size_t N>
+ void launch_vectorized_elu(const Stream& stream, Span<T> output, View<T> input) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+
+ auto kernel = raw::elu_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input);
+ }
+
+ template <class T>
+ void elu(const Stream& stream, Span<T> output, View<T> input) {
+ CV_Assert(input.size() == output.size());
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+ launch_vectorized_elu<T, 4>(stream, output, input);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+ launch_vectorized_elu<T, 2>(stream, output, input);
+ } else {
+ launch_vectorized_elu<T, 1>(stream, output, input);
+ }
+ }
+
+ template void elu<__half>(const Stream&, Span<__half>, View<__half>);
+ template void elu<float>(const Stream&, Span<float>, View<float>);
+
+ template <class T, std::size_t N>
+ void launch_vectorized_relu(const Stream& stream, Span<T> output, View<T> input, T slope) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+
+ auto kernel = raw::relu_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input, slope);
+ }
+
+ template <class T>
+ void relu(const Stream& stream, Span<T> output, View<T> input, T slope) {
+ CV_Assert(input.size() == output.size());
+
+ if(is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+ launch_vectorized_relu<T, 4>(stream, output, input, slope);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+ launch_vectorized_relu<T, 2>(stream, output, input, slope);
+ } else {
+ launch_vectorized_relu<T, 1>(stream, output, input, slope);
+ }
+ }
+
+ template void relu<__half>(const Stream&, Span<__half>, View<__half>, __half);
+ template void relu<float>(const Stream&, Span<float>, View<float>, float);
+
+ template <class T, std::size_t N>
+ void launch_vectorized_clipped_relu(const Stream& stream, Span<T> output, View<T> input, T floor, T ceiling) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+
+ auto kernel = raw::clipped_relu_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input, floor, ceiling);
+ }
+
+ template <class T>
+ void clipped_relu(const Stream& stream, Span<T> output, View<T> input, T floor, T ceiling) {
+ CV_Assert(input.size() == output.size());
+ CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
+
+ if(is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+ launch_vectorized_clipped_relu<T, 4>(stream, output, input, floor, ceiling);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+ launch_vectorized_clipped_relu<T, 2>(stream, output, input, floor, ceiling);
+ } else {
+ launch_vectorized_clipped_relu<T, 1>(stream, output, input, floor, ceiling);
+ }
+ }
+
+ template void clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
+ template void clipped_relu<float>(const Stream&, Span<float>, View<float>, float, float);
+
+ template <class T, std::size_t N>
+ void launch_vectorized_axiswise_relu(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> slope) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+ CV_Assert(inner_size % N == 0);
+
+ auto kernel = raw::axiswise_relu_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input, inner_size, slope);
+ }
+
+ template <class T>
+ void axiswise_relu(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> slope) {
+ CV_Assert(input.size() == output.size());
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
+ launch_vectorized_axiswise_relu<T, 4>(stream, output, input, inner_size, slope);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
+ launch_vectorized_axiswise_relu<T, 2>(stream, output, input, inner_size, slope);
+ } else {
+ launch_vectorized_axiswise_relu<T, 1>(stream, output, input, inner_size, slope);
+ }
+ }
+
+ template void axiswise_relu<__half>(const Stream&, Span<__half>, View<__half>, std::size_t, View<__half>);
+ template void axiswise_relu<float>(const Stream&, Span<float>, View<float>, std::size_t, View<float>);
+
+ template <class T, std::size_t N>
+ void launch_vectorized_power(const Stream& stream, Span<T> output, View<T> input, T exp, T scale, T shift) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+
+ auto kernel = raw::power_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input, exp, scale, shift);
+ }
+
+ template <class T>
+ void power(const Stream& stream, Span<T> output, View<T> input, T exp, T scale, T shift) {
+ CV_Assert(input.size() == output.size());
+
+ if (static_cast<float>(exp) == 1.0f) {
+ scale1_with_bias1(stream, output, input, scale, shift);
+ return;
+ }
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_vectorized_power<T, 4>(stream, output, input, exp, scale, shift);
+        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+ launch_vectorized_power<T, 2>(stream, output, input, exp, scale, shift);
+ } else {
+ launch_vectorized_power<T, 1>(stream, output, input, exp, scale, shift);
+ }
+ }
+
+ template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
+ template void power<float>(const Stream&, Span<float>, View<float>, float, float, float);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_ARRAY_HPP
+#define OPENCV_DNN_SRC_CUDA_ARRAY_HPP
+
+#include <cuda_runtime.h>
+
+#include "types.hpp"
+
+#include <cstddef>
+#include <type_traits>
+#include <iterator>
+#include <algorithm>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
+
+ template <class T, std::size_t N>
+ struct array {
+ using value_type = T;
+ using size_type = device::size_type;
+ using difference_type = std::ptrdiff_t;
+ using reference = typename std::add_lvalue_reference<value_type>::type;
+ using const_reference = typename std::add_lvalue_reference<typename std::add_const<value_type>::type>::type;
+ using pointer = typename std::add_pointer<value_type>::type;
+ using const_pointer = typename std::add_pointer<typename std::add_const<value_type>::type>::type;
+ using iterator = pointer;
+ using const_iterator = const_pointer;
+ using reverse_iterator = std::reverse_iterator<iterator>;
+ using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+ __host__ __device__ bool empty() const noexcept { return N == 0; }
+ __host__ __device__ size_type size() const noexcept { return N; }
+
+ __host__ __device__ iterator begin() noexcept { return ptr; }
+ __host__ __device__ iterator end() noexcept { return ptr + N; }
+ __host__ __device__ const_iterator begin() const noexcept { return ptr; }
+ __host__ __device__ const_iterator end() const noexcept { return ptr + N; }
+
+ __host__ __device__ const_iterator cbegin() const noexcept { return ptr; }
+ __host__ __device__ const_iterator cend() const noexcept { return ptr + N; }
+
+        __host__ __device__ reverse_iterator rbegin() noexcept { return reverse_iterator(ptr + N); }
+        __host__ __device__ reverse_iterator rend() noexcept { return reverse_iterator(ptr); }
+        __host__ __device__ const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(ptr + N); }
+        __host__ __device__ const_reverse_iterator rend() const noexcept { return const_reverse_iterator(ptr); }
+
+        __host__ __device__ const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(ptr + N); }
+        __host__ __device__ const_reverse_iterator crend() const noexcept { return const_reverse_iterator(ptr); }
+
+ template <class InputItr>
+ __host__ void assign(InputItr first, InputItr last) {
+ std::copy(first, last, std::begin(ptr));
+ }
+
+ __host__ __device__ reference operator[](int idx) { return ptr[idx]; }
+ __host__ __device__ const_reference operator[](int idx) const { return ptr[idx]; }
+
+ __host__ __device__ reference front() { return ptr[0]; }
+ __host__ __device__ const_reference front() const { return ptr[0]; }
+
+ __host__ __device__ reference back() { return ptr[N - 1]; }
+ __host__ __device__ const_reference back() const { return ptr[N - 1]; }
+
+ __host__ __device__ pointer data() noexcept { return ptr; }
+ __host__ __device__ const_pointer data() const noexcept { return ptr; }
+
+ T ptr[N];
+ };
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
+
+#endif /* OPENCV_DNN_SRC_CUDA_ARRAY_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_ATOMICS_HPP
+#define OPENCV_DNN_SRC_CUDA_ATOMICS_HPP
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+#else
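+/* CUDA provides a native __half overload of atomicAdd only on devices of compute capability 7.0
+ * and above; for older devices, we emulate it with a 32-bit atomicCAS loop that read-modify-writes
+ * the aligned 32-bit word containing the target half
+ */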
+inline __device__ void atomicAdd(__half* address, __half val) {
+ unsigned int* address_as_ui = (unsigned int *)((char *)address - ((size_t)address & 2));
+ unsigned int old = *address_as_ui;
+ unsigned int assumed;
+
+ do {
+ assumed = old;
+
+ __half_raw hsum;
+ hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+ __half tmpres = hsum + val;
+ hsum = __half_raw(tmpres);
+
+ old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
+ old = atomicCAS(address_as_ui, assumed, old);
+ } while (assumed != old);
+}
+#endif
+
+#endif /* OPENCV_DNN_SRC_CUDA_ATOMICS_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "array.hpp"
+#include "types.hpp"
+#include "vector_traits.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+#include "kernel_dispatcher.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <cstddef>
+#include <vector>
+#include <algorithm>
+#include <functional>
+#include <numeric>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ namespace raw {
+ template <class T, std::size_t N>
+ __global__ void concat_vec(
+ Span<T> output, size_type output_axis_size, index_type output_axis_offset,
+ View<T> input, size_type input_axis_size, size_type concat_size)
+ {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+            /* we need to copy all the elements of the input to some location in the output;
+             * we do so in blocks of size `total_concat_size`
+             */
+ const auto total_concat_size = concat_size * input_axis_size;
+
+ for (auto in_idx : grid_stride_range(input.size() / vector_type::size())) {
+ const index_type idx = in_idx * vector_type::size();
+ const index_type concat_num = idx / total_concat_size;
+ const index_type concat_index = idx % total_concat_size;
+ const index_type top_index = concat_index +
+ (concat_num * output_axis_size + output_axis_offset) * concat_size;
+
+ const auto out_idx = top_index / vector_type::size();
+
+ vector_type vec;
+ v_load(vec, input_vPtr[in_idx]);
+ v_store(output_vPtr[out_idx], vec);
+ }
+ }
+
+ template <class T, std::size_t Rank>
+ __global__ void concat_with_offsets(
+ Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> out_offset,
+ View<T> input, array<size_type, Rank> in_strides)
+ {
+ for (auto i : grid_stride_range(input.size())) {
+ index_type in_index = i / in_strides[0];
+ index_type out_index = out_offset[0] + in_index;
+ index_type oidx = out_index * out_strides[0];
+ for (int j = 1; j < Rank; j++) {
+ in_index = (i % in_strides[j - 1]) / in_strides[j];
+ out_index = out_offset[j] + in_index;
+ oidx += out_index * out_strides[j];
+ }
+
+ output[oidx] = input[i];
+ }
+ }
+ }
+
+ template <class T, std::size_t N> static
+ void launch_vectorized_concat(const Stream& stream,
+ Span<T> output, size_type output_axis_size, index_type output_axis_offset,
+ View<T> input, size_type input_axis_size, size_type concat_size)
+ {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+        /* more assertions are required to fully check for vectorization possibility; check concat() */
+
+ auto kernel = raw::concat_vec<T, N>;
+ auto policy = make_policy(kernel, input.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
+ }
+
+ template <class T>
+ void concat(
+ const Stream& stream,
+ TensorSpan<T> output, std::size_t output_axis_offset,
+ TensorView<T> input, std::size_t axis)
+ {
+        /* for the purpose of the following discussion, let's call the axis of interest the channel axis,
+         * even though it can be any axis
+         *
+         * for each batch item:
+         *    we move all the channels of a batch item from the input (where they are contiguous)
+         *    to their corresponding contiguous place in the output
+ *
+ * for a valid vector operation:
+ * - the size of each copy block must be aligned
+ * - input must be aligned
+ * - all the destination locations in the output must be aligned
+ */
+ std::size_t concat_size = output.size_range(axis + 1, output.rank());
+
+ std::size_t input_axis_size = input.get_axis_size(axis);
+ std::size_t output_axis_size = output.get_axis_size(axis);
+
+ std::size_t copy_block_size = concat_size * input_axis_size;
+ std::size_t copy_block_stride = concat_size * output_axis_size;
+ std::size_t starting_offset = output_axis_offset * concat_size;
+
+ /* in a nutshell, all this concat operation does is copy several blocks of size `copy_block_size`
+ * to the output starting from `starting_offset` with blocks in the output strided by `copy_block_stride`
+ */
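+        /* a concrete example (hypothetical shapes): concatenating a 1x3x4x4 input into a 1x8x4x4 output
+         * along axis = 1 at output_axis_offset = 2 gives concat_size = 4 * 4 = 16,
+         * copy_block_size = 16 * 3 = 48, copy_block_stride = 16 * 8 = 128 and starting_offset = 2 * 16 = 32
+         */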
+
+ bool is_aligned_4 = copy_block_size % 4 == 0 && copy_block_stride % 4 == 0 && starting_offset % 4 == 0;
+ bool is_aligned_2 = copy_block_size % 2 == 0 && copy_block_stride % 2 == 0 && starting_offset % 2 == 0;
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && is_aligned_4) {
+ launch_vectorized_concat<T, 4>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && is_aligned_2) {
+ launch_vectorized_concat<T, 2>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
+ } else {
+ launch_vectorized_concat<T, 1>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
+ }
+ }
+
+ template void concat<__half>(const Stream&, TensorSpan<__half>, std::size_t, TensorView<__half>, std::size_t);
+ template void concat<float>(const Stream&, TensorSpan<float>, std::size_t, TensorView<float>, std::size_t);
+
+ template <class T, std::size_t Rank> static
+ void launch_concat_with_offsets(
+ const Stream& stream,
+ Span<T> output, const std::vector<std::size_t>& outStride, const std::vector<std::size_t>& outOffset,
+ View<T> input, const std::vector<std::size_t>& inStride)
+ {
+ CV_Assert(outStride.size() == Rank);
+ CV_Assert(outOffset.size() == Rank);
+ CV_Assert(inStride.size() == Rank);
+
+ array<size_type, Rank> outStride_k, inStride_k;
+ outStride_k.assign(std::begin(outStride), std::end(outStride));
+ inStride_k.assign(std::begin(inStride), std::end(inStride));
+
+ array<index_type, Rank> outOffset_k;
+ outOffset_k.assign(std::begin(outOffset), std::end(outOffset));
+
+ auto kernel = raw::concat_with_offsets<T, Rank>;
+ auto policy = make_policy(kernel, input.size(), 0, stream);
+ launch_kernel(kernel, policy, output, outStride_k, outOffset_k, input, inStride_k);
+ }
+
+ GENERATE_KERNEL_DISPATCHER(concat_with_offsets_dispatcher, launch_concat_with_offsets);
+
+ template <class T>
+ void concat_with_offsets(
+ const Stream& stream,
+ TensorSpan<T> output, TensorView<T> input,
+ std::vector<std::size_t> offsets)
+ {
+ CV_Assert(output.rank() == input.rank());
+ CV_Assert(output.rank() == offsets.size());
+
+        /* squeezable axes at the beginning of both tensors can be eliminated
+ *
+ * Reasoning:
+ * ----------
+ * Suppose an item's indices in the input tensor is [i1, i2, ...]. The indices in the output
+ * tensor will be [i1 + off1, i2 + off2, ...]. The concat operation essentially copies items
+ * from the input tensor to new locations in the output tensor.
+ *
+ * If the size of the first axis of the input and output tensor is unity, the input and output
+         * indices for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...]
+         * respectively. The first index does not contribute to the element's address calculation and
+         * hence does nothing apart from eating up a few cycles.
+ */
+ while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
+ CV_Assert(offsets[0] == 0);
+
+ input.squeeze(0);
+ output.squeeze(0);
+ offsets.erase(std::begin(offsets));
+
+ CV_Assert(output.rank() == input.rank());
+ CV_Assert(output.rank() == offsets.size());
+ }
+
+ auto inShape = input.shape_as_vector();
+ auto outShape = output.shape_as_vector();
+
+ /* contiguous axes that undergo full copy can be combined into one axis
+ *
+ * Reasoning:
+ * ----------
+ * Suppose an item's indices in the input tensor is [i1, i2, i3, ...]. Let the first two axes not undergo any
+ * concatenation. The indices in the output tensor will be [i1, i2, i3 + off3, ...].
+ *
+ * Each axis in the contiguous axes sequence will add an offset of iN * strideN. In the above example,
+ * the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
+         * a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
+ * Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
+ */
+ for (int i = 0; i < inShape.size(); i++) {
+ /* check if axis `i` requires any slicing */
+ if (offsets[i] == 0 && inShape[i] == outShape[i]) {
+ /* loop invariant: `i` is the first axis in the contiguous unsliced axis sequence */
+
+ int j = i + 1; /* `j` is the axis which we will attempt to merge */
+ while (j < inShape.size() && offsets[j] == 0 && inShape[j] == outShape[j]) {
+ /* `j` axis is also copied fully; merge `i` and `j` */
+ auto new_size = inShape[i] * inShape[j];
+ inShape[i] = new_size;
+ outShape[i] = new_size;
+ offsets[i] = 0; /* redundant */
+
+ /* delete axis `j` */
+ inShape.erase(std::begin(inShape) + j);
+ outShape.erase(std::begin(outShape) + j);
+ offsets.erase(std::begin(offsets) + j);
+
+ /* optimizations should not break the invariants */
+ CV_Assert(inShape.size() == outShape.size());
+ CV_Assert(inShape.size() == offsets.size());
+ CV_Assert(inShape[i] == outShape[i]);
+ CV_Assert(offsets[i] == 0);
+ }
+ }
+ }
+
+ auto rank = inShape.size();
+
+ std::vector<std::size_t> inStride(rank), outStride(rank);
+ inStride.back() = 1;
+ outStride.back() = 1;
+ /* garbage, ..., garbage, 1 */
+
+ std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
+ std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
+ /* dim[0], dim[1], ..., dim[-1], 1 */
+
+        std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
+        std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
+ /* stride[0], stride[1], ..., stride[-2], 1 */
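+        /* worked example (hypothetical shape): for a shape of [2, 3, 4], the three steps above yield
+         * [garbage, garbage, 1] -> [3, 4, 1] -> [12, 4, 1], i.e. the row-major strides
+         */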
+
+ CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
+ concat_with_offsets_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, offsets, input, inStride);
+ }
+
+ template void concat_with_offsets(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
+ template void concat_with_offsets(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "math.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+#include "vector_traits.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ namespace raw {
+ template <class T, std::size_t N>
+ __global__ void eltwise_max_2_vec(Span<T> output, View<T> x, View<T> y) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto x_vPtr = vector_type::get_pointer(x.data());
+ auto y_vPtr = vector_type::get_pointer(y.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ vector_type vec_x, vec_y;
+ v_load(vec_x, x_vPtr[i]);
+ v_load(vec_y, y_vPtr[i]);
+
+ for (int j = 0; j < vector_type::size(); j++) {
+ using device::max;
+ vec_x.data[j] = max(vec_x.data[j], vec_y.data[j]);
+ }
+
+ v_store(output_vPtr[i], vec_x);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void eltwise_sum_2_vec(Span<T> output, View<T> x, View<T> y) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto x_vPtr = vector_type::get_pointer(x.data());
+ auto y_vPtr = vector_type::get_pointer(y.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ vector_type vec_x, vec_y;
+ v_load(vec_x, x_vPtr[i]);
+ v_load(vec_y, y_vPtr[i]);
+
+ for (int j = 0; j < vector_type::size(); j++)
+ vec_x.data[j] = vec_x.data[j] + vec_y.data[j];
+
+ v_store(output_vPtr[i], vec_x);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void eltwise_sum_coeff_2_vec(Span<T> output, T coeff_x, View<T> x, T coeff_y, View<T> y) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto x_vPtr = vector_type::get_pointer(x.data());
+ auto y_vPtr = vector_type::get_pointer(y.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ vector_type vec_x, vec_y;
+ v_load(vec_x, x_vPtr[i]);
+ v_load(vec_y, y_vPtr[i]);
+
+ for (int j = 0; j < vector_type::size(); j++)
+ vec_x.data[j] = coeff_x * vec_x.data[j] + coeff_y * vec_y.data[j];
+
+ v_store(output_vPtr[i], vec_x);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void eltwise_prod_2_vec(Span<T> output, View<T> x, View<T> y) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto x_vPtr = vector_type::get_pointer(x.data());
+ auto y_vPtr = vector_type::get_pointer(y.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ vector_type vec_x, vec_y;
+ v_load(vec_x, x_vPtr[i]);
+ v_load(vec_y, y_vPtr[i]);
+
+ for (int j = 0; j < vector_type::size(); j++)
+ vec_x.data[j] = vec_x.data[j] * vec_y.data[j];
+
+ v_store(output_vPtr[i], vec_x);
+ }
+ }
+ }
+
+ template <class T, std::size_t N>
+ void launch_vectorized_eltwise_max_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(x, N));
+ CV_Assert(is_fully_aligned<T>(y, N));
+
+ auto kernel = raw::eltwise_max_2_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, x, y);
+ }
+
+ template <class T>
+ void eltwise_max_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+ CV_Assert(x.size() == y.size());
+ CV_Assert(x.size() == output.size());
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
+ launch_vectorized_eltwise_max_2<T, 4>(stream, output, x, y);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
+ launch_vectorized_eltwise_max_2<T, 2>(stream, output, x, y);
+ } else {
+ launch_vectorized_eltwise_max_2<T, 1>(stream, output, x, y);
+ }
+ }
+
+ template void eltwise_max_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
+ template void eltwise_max_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
+
+ template <class T, std::size_t N>
+ void launch_vectorized_eltwise_sum_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(x, N));
+ CV_Assert(is_fully_aligned<T>(y, N));
+
+ auto kernel = raw::eltwise_sum_2_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, x, y);
+ }
+
+ template <class T>
+ void eltwise_sum_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+ CV_Assert(x.size() == y.size());
+ CV_Assert(x.size() == output.size());
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
+ launch_vectorized_eltwise_sum_2<T, 4>(stream, output, x, y);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
+ launch_vectorized_eltwise_sum_2<T, 2>(stream, output, x, y);
+ } else {
+ launch_vectorized_eltwise_sum_2<T, 1>(stream, output, x, y);
+ }
+ }
+
+ template void eltwise_sum_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
+ template void eltwise_sum_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
+
+ template <class T, std::size_t N>
+ void launch_vectorized_eltwise_sum_coeff_2(const Stream& stream, Span<T> output, T coeff_x, View<T> x, T coeff_y, View<T> y) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(x, N));
+ CV_Assert(is_fully_aligned<T>(y, N));
+
+ auto kernel = raw::eltwise_sum_coeff_2_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, coeff_x, x, coeff_y, y);
+ }
+
+ template <class T>
+ void eltwise_sum_coeff_2(const Stream& stream, Span<T> output, T coeff_x, View<T> x, T coeff_y, View<T> y) {
+ CV_Assert(x.size() == y.size());
+ CV_Assert(x.size() == output.size());
+
+ if (static_cast<float>(coeff_x) == 1.0f && static_cast<float>(coeff_y) == 1.0f) {
+ eltwise_sum_2(stream, output, x, y);
+ return;
+ }
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
+ launch_vectorized_eltwise_sum_coeff_2<T, 4>(stream, output, coeff_x, x, coeff_y, y);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
+ launch_vectorized_eltwise_sum_coeff_2<T, 2>(stream, output, coeff_x, x, coeff_y, y);
+ } else {
+ launch_vectorized_eltwise_sum_coeff_2<T, 1>(stream, output, coeff_x, x, coeff_y, y);
+ }
+ }
+
+ template void eltwise_sum_coeff_2(const Stream&, Span<__half>, __half, View<__half>, __half, View<__half>);
+ template void eltwise_sum_coeff_2(const Stream&, Span<float>, float, View<float>, float, View<float>);
+
+ template <class T, std::size_t N>
+ void launch_vectorized_eltwise_prod_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(x, N));
+ CV_Assert(is_fully_aligned<T>(y, N));
+
+ auto kernel = raw::eltwise_prod_2_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, x, y);
+ }
+
+ template <class T>
+ void eltwise_prod_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+ CV_Assert(x.size() == y.size());
+ CV_Assert(x.size() == output.size());
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
+ launch_vectorized_eltwise_prod_2<T, 4>(stream, output, x, y);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
+ launch_vectorized_eltwise_prod_2<T, 2>(stream, output, x, y);
+ } else {
+ launch_vectorized_eltwise_prod_2<T, 1>(stream, output, x, y);
+ }
+ }
+
+ template void eltwise_prod_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
+ template void eltwise_prod_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
+#define OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
+
+#include "../cuda4dnn/csl/error.hpp"
+#include "../cuda4dnn/csl/stream.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cuda_runtime_api.h>
+
+#include <cstddef>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+ struct execution_policy {
+ execution_policy(dim3 grid_size, dim3 block_size)
+            : grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ nullptr } { }
+
+ execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem)
+ : grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ nullptr } { }
+
+ execution_policy(dim3 grid_size, dim3 block_size, const Stream& strm)
+ : grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ strm.get() } { }
+
+ execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem, const Stream& strm)
+ : grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ strm.get() } { }
+
+ dim3 grid;
+ dim3 block;
+ std::size_t sharedMem;
+ cudaStream_t stream;
+ };
+
+ /* this overload shouldn't be necessary; we should always provide a bound on the number of threads */
+ /*
+ template <class Kernel> inline
+ execution_policy make_policy(Kernel kernel, std::size_t sharedMem = 0, const Stream& stream = 0) {
+ int grid_size, block_size;
+ CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
+ return execution_policy(grid_size, block_size, sharedMem, stream);
+ }*/
+
+ template <class Kernel> inline
+ execution_policy make_policy(Kernel kernel, std::size_t max_threads, std::size_t sharedMem = 0, const Stream& stream = 0) {
+ CV_Assert(max_threads > 0);
+
+ int grid_size = 0, block_size = 0;
+ CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
+ if (grid_size * block_size > max_threads) {
+ grid_size = (max_threads + block_size - 1) / block_size;
+ if (block_size > max_threads)
+ block_size = max_threads;
+ }
+
+ CV_Assert(grid_size >= 1 && block_size >= 1);
+ return execution_policy(grid_size, block_size, sharedMem, stream);
+ }
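+
+    /* for instance (illustrative numbers), if the occupancy API suggests block_size = 256 and a large
+     * grid_size but max_threads is 1000, the grid is clamped to (1000 + 255) / 256 = 4 blocks
+     */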
+
+ template <class Kernel, typename ...Args> inline
+ void launch_kernel(Kernel kernel, Args ...args) {
+ auto policy = make_policy(kernel);
+ kernel <<<policy.grid, policy.block>>> (std::forward<Args>(args)...);
+ }
+
+ template <class Kernel, typename ...Args> inline
+ void launch_kernel(Kernel kernel, dim3 grid, dim3 block, Args ...args) {
+ kernel <<<grid, block>>> (std::forward<Args>(args)...);
+ }
+
+ template <class Kernel, typename ...Args> inline
+ void launch_kernel(Kernel kernel, execution_policy policy, Args ...args) {
+ kernel <<<policy.grid, policy.block, policy.sharedMem, policy.stream>>> (std::forward<Args>(args)...);
+ }
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA_EXECUTION_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+#include "vector_traits.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ namespace raw {
+ template <class T, std::size_t N>
+ __global__ void fill_vec(Span<T> output, T value) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ vector_type vec;
+ for (int j = 0; j < vector_type::size(); j++)
+ vec.data[j] = value;
+ v_store(output_vPtr[i], vec);
+ }
+ }
+ }
+
+ template <class T, std::size_t N>
+ void launch_vectorized_fill(const Stream& stream, Span<T> output, T value) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+
+ auto kernel = raw::fill_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, value);
+ }
+
+ template <class T>
+ void fill(const Stream& stream, Span<T> output, T value) {
+ if (is_fully_aligned<T>(output, 4)) {
+ launch_vectorized_fill<T, 4>(stream, output, value);
+ } else if (is_fully_aligned<T>(output, 2)) {
+ launch_vectorized_fill<T, 2>(stream, output, value);
+ } else {
+ launch_vectorized_fill<T, 1>(stream, output, value);
+ }
+ }
+
+ template void fill(const Stream&, Span<__half>, __half);
+ template void fill(const Stream&, Span<float>, float);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP
+#define OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP
+
+#include "types.hpp"
+
+#include <cuda_runtime.h>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
+
+ namespace detail {
+ template <int> __device__ auto getGridDim()->decltype(dim3::x);
+ template <> inline __device__ auto getGridDim<0>()->decltype(dim3::x) { return gridDim.x; }
+ template <> inline __device__ auto getGridDim<1>()->decltype(dim3::x) { return gridDim.y; }
+ template <> inline __device__ auto getGridDim<2>()->decltype(dim3::x) { return gridDim.z; }
+
+ template <int> __device__ auto getBlockDim()->decltype(dim3::x);
+ template <> inline __device__ auto getBlockDim<0>()->decltype(dim3::x) { return blockDim.x; }
+ template <> inline __device__ auto getBlockDim<1>()->decltype(dim3::x) { return blockDim.y; }
+ template <> inline __device__ auto getBlockDim<2>()->decltype(dim3::x) { return blockDim.z; }
+
+ template <int> __device__ auto getBlockIdx()->decltype(uint3::x);
+ template <> inline __device__ auto getBlockIdx<0>()->decltype(uint3::x) { return blockIdx.x; }
+ template <> inline __device__ auto getBlockIdx<1>()->decltype(uint3::x) { return blockIdx.y; }
+ template <> inline __device__ auto getBlockIdx<2>()->decltype(uint3::x) { return blockIdx.z; }
+
+ template <int> __device__ auto getThreadIdx()->decltype(uint3::x);
+ template <> inline __device__ auto getThreadIdx<0>()->decltype(uint3::x) { return threadIdx.x; }
+ template <> inline __device__ auto getThreadIdx<1>()->decltype(uint3::x) { return threadIdx.y; }
+ template <> inline __device__ auto getThreadIdx<2>()->decltype(uint3::x) { return threadIdx.z; }
+ }
+
+ template <int dim, class index_type = device::index_type, class size_type = device::size_type>
+ class grid_stride_range_generic {
+ public:
+ __device__ grid_stride_range_generic(index_type to_) : from(0), to(to_) { }
+ __device__ grid_stride_range_generic(index_type from_, index_type to_) : from(from_), to(to_) { }
+
+ class iterator
+ {
+ public:
+ __device__ iterator(index_type pos_) : pos(pos_) {}
+
+ /* these iterators return the index when dereferenced; this allows us to loop
+             * through the indices using a range-based for loop
+ */
+ __device__ index_type operator*() const { return pos; }
+
+ __device__ iterator& operator++() {
+ pos += detail::getGridDim<dim>() * static_cast<index_type>(detail::getBlockDim<dim>());
+ return *this;
+ }
+
+ __device__ bool operator!=(const iterator& other) const {
+ /* NOTE HACK
+ ** 'pos' can move in large steps (see operator++)
+                ** expansion of the range-based for loop uses != as the loop condition
+ ** => operator!= must return false if 'pos' crosses the end
+ */
+ return pos < other.pos;
+ }
+
+ private:
+ index_type pos;
+ };
+
+ __device__ iterator begin() const {
+ using detail::getBlockDim;
+ using detail::getBlockIdx;
+ using detail::getThreadIdx;
+ return iterator(from + getBlockDim<dim>() * getBlockIdx<dim>() + getThreadIdx<dim>());
+ }
+
+ __device__ iterator end() const {
+ return iterator(to);
+ }
+
+ private:
+ index_type from, to;
+ };
+
+ using grid_stride_range_x = grid_stride_range_generic<0>;
+ using grid_stride_range_y = grid_stride_range_generic<1>;
+ using grid_stride_range_z = grid_stride_range_generic<2>;
+ using grid_stride_range = grid_stride_range_x;
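+
+    /* usage sketch (illustrative; `copy` is a hypothetical kernel, not part of this header):
+     *
+     *   __global__ void copy(float* dst, const float* src, device::size_type n) {
+     *       for (auto i : grid_stride_range(n))
+     *           dst[i] = src[i];
+     *   }
+     *
+     * each thread starts at its global index and advances by the total number of launched threads,
+     * so any launch configuration covers all n indices
+     */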
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
+
+#endif /* OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP
+#define OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP
+
+#include <cstddef>
+#include <type_traits>
+#include <utility>
+
+/* The performance of many kernels is highly dependent on the tensor rank. Instead of having
+ * one kernel which can work with the maximally ranked tensors, we make one kernel for each supported
+ * tensor rank. This is to ensure that the requirements of the maximally ranked tensors do not take a
+ * toll on the performance of the operation for low ranked tensors. Hence, many kernels take the tensor
+ * rank as a template parameter.
+ *
+ * The kernel is a template and we have different instantiations for each rank. This causes the following pattern
+ * to arise frequently:
+ *
+ * if(rank == 3)
+ * kernel<T, 3>();
+ * else if(rank == 2)
+ * kernel<T, 2>();
+ * else
+ * kernel<T, 1>();
+ *
+ * The rank is a runtime variable. To facilitate creation of such structures, we use GENERATE_KERNEL_DISPATCHER.
+ * This macro creates a function which selects the correct kernel instantiation at runtime.
+ *
+ * Example:
+ *
+ * // function which setups the kernel and launches it
+ * template <class T, std::size_t Rank>
+ * void launch_some_kernel(...);
+ *
+ * // creates the dispatcher named "some_dispatcher" which invokes the correct instantiation of "launch_some_kernel"
+ * GENERATE_KERNEL_DISPATCHER(some_dispatcher, launch_some_kernel);
+ *
+ * // internal API function
+ * template <class T>
+ * void some(...) {
+ * // ...
+ * auto rank = input.rank();
+ * some_dispatcher<T, MIN_RANK, MAX_RANK>(rank, ...);
+ * }
+ */
+
+/*
+ * name name of the dispatcher function that is generated
+ * func template function that requires runtime selection
+ *
+ * T first template parameter to `func`
+ * start starting rank
+ * end ending rank (inclusive)
+ *
+ * Executes func<T, selector> based on the runtime `selector` argument, provided that `selector`
+ * lies within the range [start, end]. If it is outside the range, no instantiation of `func` is executed.
+ */
+#define GENERATE_KERNEL_DISPATCHER(name,func) \
+ template <class T, std::size_t start, std::size_t end, class... Args> static \
+ typename std::enable_if<start == end, void> \
+ ::type name(int selector, Args&& ...args) { \
+ if(selector == start) \
+ func<T, start>(std::forward<Args>(args)...); \
+ } \
+ \
+ template <class T, std::size_t start, std::size_t end, class... Args> static \
+ typename std::enable_if<start != end, void> \
+ ::type name(int selector, Args&& ...args) { \
+ if(selector == start) \
+ func<T, start>(std::forward<Args>(args)...); \
+ else \
+ name<T, start + 1, end, Args...>(selector, std::forward<Args>(args)...); \
+ }
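+
+/* for illustration, a dispatcher generated with start = 1 and end = 3 behaves like:
+ *
+ *   if (selector == 1)      func<T, 1>(args...);
+ *   else if (selector == 2) func<T, 2>(args...);
+ *   else if (selector == 3) func<T, 3>(args...);
+ *
+ * (a sketch of the recursive expansion; out-of-range selectors fall through and do nothing)
+ */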
+
+#endif /* OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_LIMITS_HPP
+#define OPENCV_DNN_SRC_CUDA_LIMITS_HPP
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include <cfloat>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
+
+ template <class T>
+ struct numeric_limits;
+
+ template <>
+ struct numeric_limits<__half> {
+ __device__ static __half min() { return 0.0000610; }
+ __device__ static __half max() { return 65504.0; }
+ __device__ static __half lowest() { return -65504.0; }
+ };
+
+ template <>
+ struct numeric_limits<float> {
+ __device__ static float min() { return FLT_MIN; }
+ __device__ static float max() { return FLT_MAX; }
+ __device__ static float lowest() { return -FLT_MAX; }
+ };
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
+
+#endif /* OPENCV_DNN_SRC_CUDA_LIMITS_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_MATH_HPP
+#define OPENCV_DNN_SRC_CUDA_MATH_HPP
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
+
+ template <class T> __device__ T abs(T val) { return (val < T(0) ? -val : val); }
+ template <> inline __device__ __half2 abs(__half2 val) {
+ val.x = abs(val.x);
+ val.y = abs(val.y);
+ return val;
+ }
+ template <> inline __device__ float abs(float val) { return fabsf(val); }
+ template <> inline __device__ double abs(double val) { return fabs(val); }
+
+ template <class T> __device__ T exp(T val);
+ template <> inline __device__ __half exp(__half val) { return hexp(val); }
+ template <> inline __device__ __half2 exp(__half2 val) { return h2exp(val); }
+ template <> inline __device__ float exp(float val) { return expf(val); }
+ template <> inline __device__ double exp(double val) { return ::exp(val); }
+
+ template <class T> __device__ T expm1(T val);
+    template <> inline __device__ __half expm1(__half val) { return hexp(val) - __half(1); }
+    template <> inline __device__ __half2 expm1(__half2 val) { return h2exp(val) - __half2(1, 1); }
+ template <> inline __device__ float expm1(float val) { return expm1f(val); }
+ template <> inline __device__ double expm1(double val) { return ::expm1(val); }
+
+ template <class T> __device__ T max(T x, T y) { return (x > y ? x : y); }
+ template <> inline __device__ __half2 max(__half2 a, __half2 b) {
+        a.x = max(a.x, b.x);
+ a.y = max(a.y, b.y);
+ return a;
+ }
+ template <> inline __device__ float max(float x, float y) { return fmaxf(x, y); }
+ template <> inline __device__ double max(double x, double y) { return fmax(x, y); }
+
+ template <class T> __device__ T min(T x, T y) { return (x > y ? y : x); }
+ template <> inline __device__ __half2 min(__half2 a, __half2 b) {
+        a.x = min(a.x, b.x);
+ a.y = min(a.y, b.y);
+ return a;
+ }
+ template <> inline __device__ float min(float x, float y) { return fminf(x, y); }
+ template <> inline __device__ double min(double x, double y) { return fmin(x, y); }
+
+ template <class T> __device__ T log1p(T val);
+    template <> inline __device__ __half log1p(__half val) { return hlog(__half(1) + val); }
+    template <> inline __device__ __half2 log1p(__half2 val) { return h2log(__half2(1, 1) + val); }
+    template <> inline __device__ float log1p(float val) { return log1pf(val); }
+    template <> inline __device__ double log1p(double val) { return ::log1p(val); }
+
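+    /* log1pexp(x) = log(1 + exp(x)) computed without overflow: exp(x) for very negative x
+     * (where log(1 + exp(x)) is approximately exp(x)), log1p(exp(x)) in the middle range,
+     * x + exp(-x) once exp(x) dominates, and x itself when exp(-x) underflows
+     */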
+ template <class T> __device__ T log1pexp(T val);
+ template <> inline __device__ __half log1pexp(__half val) {
+ if (val <= __half(-4.0))
+ return exp(val);
+ else if (val <= __half(8.0))
+ return log1p(exp(val));
+ else if (val <= __half(8.7))
+ return val + exp(-val);
+ else
+ return val;
+ }
+ template <> inline __device__ __half2 log1pexp(__half2 val) {
+ val.x = log1pexp(val.x);
+ val.y = log1pexp(val.y);
+ return val;
+ }
+ template <> inline __device__ float log1pexp(float val) {
+ if (val <= -20)
+ return expf(val);
+ else if (val <= 9.0)
+ return log1pf(expf(val));
+ else if (val <= 14.6)
+ return val + exp(-val);
+ else
+ return val;
+ }
+ template <> inline __device__ double log1pexp(double val) {
+ if (val <= -37)
+ return exp(val);
+ else if (val <= 18)
+ return log1p(exp(val));
+ else if (val <= 33.3)
+ return val + exp(-val);
+ else
+ return val;
+ }
+
+ template <class T> __device__ T tanh(T val);
+ template <> inline __device__ __half tanh(__half val) { return tanhf(val); }
+ template <> inline __device__ __half2 tanh(__half2 val) { return __half2(tanh(val.x), tanh(val.y)); }
+ template <> inline __device__ float tanh(float val) { return tanhf(val); }
+ template <> inline __device__ double tanh(double val) { return ::tanh(val); }
+
+ template <class T> __device__ T pow(T val, T exp);
+ template <> inline __device__ __half pow(__half val, __half exp) { return powf(val, exp); }
+ template <> inline __device__ __half2 pow(__half2 val, __half2 exp) { return __half2(pow(val.x, exp.x), pow(val.y, exp.y)); }
+ template <> inline __device__ float pow(float val, float exp) { return powf(val, exp); }
+ template <> inline __device__ double pow(double val, double exp) { return ::pow(val, exp); }
+
+ template <class T> __device__ T sqrt(T val);
+ template <> inline __device__ __half sqrt(__half val) { return hsqrt(val); }
+ template <> inline __device__ __half2 sqrt(__half2 val) { return h2sqrt(val); }
+ template <> inline __device__ float sqrt(float val) { return sqrtf(val); }
+ template <> inline __device__ double sqrt(double val) { return ::sqrt(val); }
+
+ template <class T> __device__ T rsqrt(T val);
+ template <> inline __device__ __half rsqrt(__half val) { return hrsqrt(val); }
+ template <> inline __device__ __half2 rsqrt(__half2 val) { return h2rsqrt(val); }
+ template <> inline __device__ float rsqrt(float val) { return rsqrtf(val); }
+ template <> inline __device__ double rsqrt(double val) { return ::rsqrt(val); }
+
+ template <class T> __device__ T sigmoid(T val) { return T(1) / (T(1) + exp(-val)); }
+ template <> inline __device__ __half2 sigmoid(__half2 val) { return __half2(1, 1) / (__half2(1, 1) + exp(__hneg2(val))); }
+
+ template <class T> __device__ T clamp(T value, T lower, T upper) { return min(max(value, lower), upper); }
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
+
+#endif /* OPENCV_DNN_SRC_CUDA_MATH_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "math.hpp"
+#include "array.hpp"
+#include "limits.hpp"
+#include "types.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include "../cuda4dnn/kernels/fill.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <type_traits>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ namespace raw {
+ template <class T, std::size_t Order,
+ typename std::enable_if<Order == 2 || Order == 3, bool>::type = true> /* Order has been hardcoded; see code */
+ __global__ void max_pooling_with_indices(
+ Span<T> output, Span<T> indices, View<T> input, size_type channels,
+ array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
+ array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
+ {
+ /* every element in the output is mapped to a window in the input and each thread processes several windows */
+ for (auto idx : grid_stride_range(output.size())) {
+ size_type out_spatial_size = 1;
+ array<index_type, Order> window_idx;
+ for (int i = Order - 1; i >= 0; i--) {
+ window_idx[i] = (idx / out_spatial_size) % out_spatial_dims[i];
+ out_spatial_size *= out_spatial_dims[i];
+ }
+
+ const index_type n = idx / (out_spatial_size * channels);
+ const index_type c = (idx / out_spatial_size) % channels;
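+                /* `idx` linearizes (batch, channel, spatial window indices); the computations
+                 * above recover the spatial window indices and the batch/channel coordinates
+                 */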
+
+ array<index_type, Order> start;
+ for(int i = 0; i < Order; i++)
+ start[i] = window_idx[i] * strides[i] - padding_left[i];
+
+ array<index_type, Order> end;
+ for (int i = 0; i < Order; i++) {
+ using device::min;
+ end[i] = min<index_type>(start[i] + window_size[i], in_spatial_dims[i]);
+ }
+
+ for (int i = 0; i < Order; i++) {
+ using device::max;
+ start[i] = max(start[i], 0);
+ }
+
+ T max_value = numeric_limits<T>::lowest();
+ index_type max_idx = -1;
+
+ size_type in_spatial_size = 1;
+ for (int i = 0; i < Order; i++)
+ in_spatial_size *= in_spatial_dims[i];
+
+ const auto outer_offset = (n * channels + c) * in_spatial_size;
+ if (Order == 2) {
+ array<index_type, Order> idx;
+ for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
+ for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
+ index_type offset = 0;
+ index_type stride = 1;
+ for (int i = Order - 1; i >= 0; i--) {
+ offset += stride * idx[i];
+ stride *= in_spatial_dims[i];
+ }
+
+ if (input[outer_offset + offset] > max_value) {
+ max_idx = offset;
+ max_value = input[outer_offset + offset];
+ }
+ }
+ }
+ } else if(Order == 3) {
+ array<index_type, Order> idx;
+ for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
+ for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
+ for (idx[2] = start[2]; idx[2] != end[2]; idx[2]++) {
+ index_type offset = 0;
+ index_type stride = 1;
+ for (int i = Order - 1; i >= 0; i--) {
+ offset += stride * idx[i];
+ stride *= in_spatial_dims[i];
+ }
+
+ if (input[outer_offset + offset] > max_value) {
+ max_idx = offset;
+ max_value = input[outer_offset + offset];
+ }
+ }
+ }
+ }
+ }
+
+ output[idx] = max_value;
+ indices[idx] = max_idx;
+ }
+ }
+
+ template <class T, std::size_t Order>
+ __global__ void max_unpooling(
+ Span<T> output, View<T> input, View<T> indices, size_type channels,
+ array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
+ array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
+ {
+ /* the output has already been zero filled */
+ /* Every input value represents a window in the output. The max unpooling operation
+ * copies the input value to exactly one location in the output window which is given
+ * by the indices tensor.
+ */
+ for (auto idx : grid_stride_range(input.size())) {
+ size_type in_spatial_size = 1;
+ array<index_type, Order> window_idx;
+ for (int i = Order - 1; i >= 0; i--) {
+ window_idx[i] = (idx / in_spatial_size) % in_spatial_dims[i];
+ in_spatial_size *= in_spatial_dims[i];
+ }
+
+ const index_type n = idx / (in_spatial_size * channels);
+ const index_type c = (idx / in_spatial_size) % channels;
+
+ array<index_type, Order> start;
+ for (int i = 0; i < Order; i++) {
+ using device::min;
+ using device::max;
+ start[i] = max(0, min(window_idx[i] * strides[i] - padding_left[i], out_spatial_dims[i] - 1));
+ }
+
+ size_type out_spatial_size = 1;
+ for (int i = 0; i < Order; i++)
+ out_spatial_size *= out_spatial_dims[i];
+
+ index_type outer_offset = (n * channels + c) * out_spatial_size;
+ output[outer_offset + static_cast<index_type>(indices[idx])] = input[idx];
+ }
+ }
+ }
+
+ template <class T, std::size_t Order> static
+ void launch_max_pooling_kernel(
+ const Stream& stream,
+ Span<T> output, Span<T> indices, View<T> input, std::size_t channels,
+ const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
+ const std::vector<std::size_t>& window_size,
+ const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
+ {
+ CV_Assert(indices.size() == output.size());
+ CV_Assert(out_spatial_dims.size() == Order);
+ CV_Assert(in_spatial_dims.size() == Order);
+ CV_Assert(window_size.size() == Order);
+ CV_Assert(strides.size() == Order);
+ CV_Assert(padding_left.size() == Order);
+
+ array<size_type, Order> out_spatial_dims_k, in_spatial_dims_k;
+ out_spatial_dims_k.assign(std::begin(out_spatial_dims), std::end(out_spatial_dims));
+ in_spatial_dims_k.assign(std::begin(in_spatial_dims), std::end(in_spatial_dims));
+
+ array<size_type, Order> window_size_k, strides_k, padding_left_k;
+ window_size_k.assign(std::begin(window_size), std::end(window_size));
+ strides_k.assign(std::begin(strides), std::end(strides));
+ padding_left_k.assign(std::begin(padding_left), std::end(padding_left));
+
+ auto kernel = raw::max_pooling_with_indices<T, Order>;
+ auto policy = make_policy(kernel, output.size(), 0, stream);
+ launch_kernel(kernel, policy, output, indices, input, channels,
+ out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k);
+ }
+
+ template <class T>
+ void max_pooling_with_indices(
+ const Stream& stream,
+ TensorSpan<T> output, TensorSpan<T> indices, TensorView<T> input,
+ const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
+ const std::vector<std::size_t>& padding_left)
+ {
+ CV_Assert(is_shape_same(output, indices));
+ CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
+
+ auto order = window_size.size();
+ CV_Assert(strides.size() == order);
+ CV_Assert(padding_left.size() == order);
+ CV_Assert(output.rank() == order + 2);
+ CV_Assert(input.rank() == order + 2);
+
+ std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
+ for (int i = 0; i < order; i++) {
+ in_spatial_dims[i] = input.get_axis_size(2 + i);
+ out_spatial_dims[i] = output.get_axis_size(2 + i);
+ }
+
+ /* only max_pooling2d and max_pooling3d are supported */
+ CV_Assert(2 <= order && order <= 3);
+ std::size_t channels = input.get_axis_size(1);
+ if (order == 3) {
+ launch_max_pooling_kernel<T, 3>(stream, output, indices, input, channels,
+ out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
+ } else if (order == 2) {
+ launch_max_pooling_kernel<T, 2>(stream, output, indices, input, channels,
+ out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
+ }
+ }
+
+ template void max_pooling_with_indices(const Stream&,
+ TensorSpan<__half>, TensorSpan<__half>, TensorView<__half>,
+ const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+ const std::vector<std::size_t>&);
+
+ template void max_pooling_with_indices(const Stream&,
+ TensorSpan<float>, TensorSpan<float>, TensorView<float>,
+ const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+ const std::vector<std::size_t>&);
+
+ template <class T, std::size_t Order> static
+ void launch_max_unpooling_kernel(
+ const Stream& stream,
+ Span<T> output, View<T> input, View<T> indices, std::size_t channels,
+ const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
+ const std::vector<std::size_t>& window_size,
+ const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
+ {
+ CV_Assert(out_spatial_dims.size() == Order);
+ CV_Assert(in_spatial_dims.size() == Order);
+ CV_Assert(window_size.size() == Order);
+ CV_Assert(strides.size() == Order);
+ CV_Assert(padding_left.size() == Order);
+ CV_Assert(indices.size() == input.size());
+
+ array<size_type, Order> out_spatial_dims_k, in_spatial_dims_k;
+ out_spatial_dims_k.assign(std::begin(out_spatial_dims), std::end(out_spatial_dims));
+ in_spatial_dims_k.assign(std::begin(in_spatial_dims), std::end(in_spatial_dims));
+
+ array<size_type, Order> window_size_k, strides_k, padding_left_k;
+ window_size_k.assign(std::begin(window_size), std::end(window_size));
+ strides_k.assign(std::begin(strides), std::end(strides));
+ padding_left_k.assign(std::begin(padding_left), std::end(padding_left));
+
+ auto kernel = raw::max_unpooling<T, Order>;
+ auto policy = make_policy(kernel, input.size(), 0, stream);
+ launch_kernel(kernel, policy, output, input, indices, channels,
+ out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k);
+ }
+
+ template <class T>
+ void max_unpooling(
+ const Stream& stream,
+ TensorSpan<T> output, TensorView<T> input, TensorView<T> indices,
+ const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
+ const std::vector<std::size_t>& padding_left)
+ {
+ CV_Assert(is_shape_same(input, indices));
+ CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
+
+ auto order = window_size.size();
+ CV_Assert(strides.size() == order);
+ CV_Assert(padding_left.size() == order);
+ CV_Assert(output.rank() == order + 2);
+ CV_Assert(input.rank() == order + 2);
+
+ std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
+ for (int i = 0; i < order; i++) {
+ in_spatial_dims[i] = input.get_axis_size(2 + i);
+ out_spatial_dims[i] = output.get_axis_size(2 + i);
+ }
+
+ kernels::fill<T>(stream, output, 0.0);
+
+ /* only max_unpooling2d and max_unpooling3d are supported */
+ CV_Assert(2 <= order && order <= 3);
+ std::size_t channels = input.get_axis_size(1);
+ if (order == 3) {
+ launch_max_unpooling_kernel<T, 3>(stream, output, input, indices, channels,
+ out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
+ } else if (order == 2) {
+ launch_max_unpooling_kernel<T, 2>(stream, output, input, indices, channels,
+ out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
+ }
+ }
+
+ template void max_unpooling(const Stream&,
+ TensorSpan<__half>, TensorView<__half>, TensorView<__half>,
+ const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+ const std::vector<std::size_t>&);
+
+ template void max_unpooling(const Stream&,
+ TensorSpan<float>, TensorView<float>, TensorView<float>,
+ const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+ const std::vector<std::size_t>&);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "array.hpp"
+#include "math.hpp"
+#include "types.hpp"
+#include "atomics.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include "../cuda4dnn/kernels/fill.hpp"
+#include "../cuda4dnn/kernels/scale_shift.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ namespace raw {
+ template <class T>
+ __global__ void reduce_sum_abs(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
+ for (auto idx : grid_stride_range(input.size())) {
+ const index_type outer_idx = idx / outer_stride;
+ const index_type inner_idx = idx % mid_stride;
+
+ const index_type sum_idx = outer_idx * mid_stride + inner_idx;
+ atomicAdd(&output[sum_idx], device::abs(input[idx]));
+ }
+ }
+
+ template <class T>
+ __global__ void reciprocal(Span<T> output, T epsilon) {
+ for (auto idx : grid_stride_range(output.size()))
+ output[idx] = T(1) / (output[idx] + epsilon);
+ }
+
+ template <class T>
+ __global__ void reduce_sum_squared(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
+ for (auto idx : grid_stride_range(input.size())) {
+ const index_type outer_idx = idx / outer_stride;
+ const index_type inner_idx = idx % mid_stride;
+
+ const index_type sum_idx = outer_idx * mid_stride + inner_idx;
+ atomicAdd(&output[sum_idx], input[idx] * input[idx]);
+ }
+ }
+
+ template <class T>
+ __global__ void rsqrt(Span<T> output, T epsilon) {
+ for (auto idx : grid_stride_range(output.size())) {
+ using device::sqrt;
+ output[idx] = T(1) / sqrt(output[idx] + epsilon);
+ }
+ }
+
+ template <class T>
+ __global__ void apply_norm(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride, View<T> sums) {
+ for (auto idx : grid_stride_range(output.size())) {
+ const index_type outer_idx = idx / outer_stride;
+ const index_type inner_idx = idx % mid_stride;
+
+ const index_type sum_idx = outer_idx * mid_stride + inner_idx;
+ output[idx] = input[idx] * sums[sum_idx];
+ }
+ }
+ }
+
+ template <class T>
+ void normalize(
+ const Stream& stream,
+ Span<T> output,
+ View<T> input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, std::size_t norm, T epsilon,
+ Span<T> workspace)
+ {
+ CV_Assert(output.size() == input.size());
+ CV_Assert(output.size() == outer_size * mid_size * inner_size);
+ CV_Assert(norm == 1 || norm == 2);
+ CV_Assert(workspace.size() >= outer_size * inner_size);
+
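+        /* the data is viewed as a [outer_size, mid_size, inner_size] tensor and the norm is
+         * computed over the mid axis; `sums` holds one accumulator per (outer, inner) pair
+         */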
+ auto sums = Span<T>(workspace.data(), outer_size * inner_size);
+
+ fill<T>(stream, sums, 0.0);
+
+ if (norm == 1) {
+ auto reduce_kernel = raw::reduce_sum_abs<T>;
+ auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
+ launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);
+
+ auto reciprocal_kernel = raw::reciprocal<T>;
+ policy = make_policy(reciprocal_kernel, sums.size(), 0, stream);
+ launch_kernel(reciprocal_kernel, policy, sums, epsilon);
+ } else {
+ auto reduce_kernel = raw::reduce_sum_squared<T>;
+ auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
+ launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);
+
+ auto rsqrt_kernel = raw::rsqrt<T>;
+ policy = make_policy(rsqrt_kernel, sums.size(), 0, stream);
+ launch_kernel(rsqrt_kernel, policy, sums, epsilon);
+ }
+
+ auto scale_kernel = raw::apply_norm<T>;
+ auto policy = make_policy(scale_kernel, output.size(), 0, stream);
+ launch_kernel(scale_kernel, policy, output, input, mid_size * inner_size, inner_size, sums);
+ }
+
+ template void normalize(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t, std::size_t, __half, Span<__half>);
+ template void normalize(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t, std::size_t, float, Span<float>);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "array.hpp"
+#include "math.hpp"
+#include "types.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+#include "kernel_dispatcher.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+#include <algorithm>
+#include <functional>
+#include <numeric>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ namespace raw {
+ template <class T, std::size_t Rank>
+ __global__ void copy_with_reflection101(
+ Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> start, array<index_type, Rank> end,
+ View<T> input, array<size_type, Rank> in_strides)
+ {
+ for (auto i : grid_stride_range(output.size())) {
+ /* compute output axis indices corresponding to element 'i' */
+ array<index_type, Rank> out_index;
+ out_index[0] = i / out_strides[0];
+ for (int j = 1; j < Rank; j++)
+ out_index[j] = (i % out_strides[j - 1]) / out_strides[j];
+
+ /* compute input axis indices corresponding to output axis indices */
+ array<index_type, Rank> in_index;
+ for (int j = 0; j < Rank; j++) {
+ /* if out_index < start, the point is in the left reflection region
+ * the reflected value's index is the absolute value of the difference
+ *
+ * otherwise, if the value is in the copy region, out_index - start gives the input index
+ */
+ using device::abs;
+ in_index[j] = abs(out_index[j] - start[j]);
+
+ /* if out_index >= end, it's in the right reflection region */
+ if (out_index[j] >= end[j])
+ in_index[j] = (end[j] - start[j]) - (out_index[j] - end[j]) - 2;
+ }
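+
+                    /* worked example (reflect101 on an axis of size 5 with start = 2, end = 7):
+                     * output indices 0..8 map to input indices 2 1 0 1 2 3 4 3 2
+                     */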
+
+ /* compute input element number from input axis indices */
+ index_type iidx = 0;
+ for (int j = 0; j < Rank; j++)
+ iidx += in_index[j] * in_strides[j];
+
+ output[i] = input[iidx];
+ }
+ }
+ }
+
+ template <class T, std::size_t Rank> static
+ void launch_copy_with_reflection101(
+ const Stream& stream,
+ Span<T> output, const std::vector<std::size_t>& outStride,
+ View<T> input, const std::vector<std::size_t>& inStride,
+ const std::vector<std::pair<std::size_t, std::size_t>>& ranges)
+ {
+ CV_Assert(outStride.size() == Rank);
+ CV_Assert(inStride.size() == Rank);
+ CV_Assert(ranges.size() == Rank);
+
+ array<size_type, Rank> outStride_k, inStride_k;
+ outStride_k.assign(std::begin(outStride), std::end(outStride));
+ inStride_k.assign(std::begin(inStride), std::end(inStride));
+
+ array<index_type, Rank> start_k, end_k;
+ for (int i = 0; i < Rank; i++) {
+ start_k[i] = ranges[i].first;
+ end_k[i] = ranges[i].second;
+ }
+
+ auto kernel = raw::copy_with_reflection101<T, Rank>;
+ auto policy = make_policy(kernel, output.size(), 0, stream);
+ launch_kernel(kernel, policy, output, outStride_k, start_k, end_k, input, inStride_k);
+ }
+
+ GENERATE_KERNEL_DISPATCHER(copy_with_reflection101_dispatcher, launch_copy_with_reflection101);
+
+ template <class T>
+ void copy_with_reflection101(
+ const Stream& stream,
+ TensorSpan<T> output, TensorView<T> input,
+ std::vector<std::pair<std::size_t, std::size_t>> ranges)
+ {
+ CV_Assert(output.rank() == input.rank());
+ CV_Assert(output.rank() == ranges.size());
+
+        /* squeezable axes at the beginning of both tensors can be eliminated
+ *
+ * Reasoning:
+ * ----------
+         * Suppose an item's indices in the input tensor are [i1, i2, ...]. The indices in the
+ * output tensor will be [i1 + off1, i2 + off2, ...]. The rest of the elements in the output are padding.
+ * The padding operation essentially copies items from the input tensor to new locations in the output tensor
+ * and pads the remaining.
+ *
+ * If the size of the first axis of the input and output tensor is unity, the input and output indices
+         * for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...] respectively. Note that
+         * there cannot be extra padding since the axes have unit size. The first index does not contribute to the
+         * element's address calculation and hence does nothing apart from eating up a few cycles.
+ */
+ while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
+ CV_Assert(ranges[0].first == 0 && ranges[0].second == 1);
+
+ input.squeeze(0);
+ output.squeeze(0);
+ ranges.erase(std::begin(ranges));
+
+ CV_Assert(output.rank() == input.rank());
+ CV_Assert(output.rank() == ranges.size());
+ }
+
+ auto inShape = input.shape_as_vector();
+ auto outShape = output.shape_as_vector();
+
+ /* contiguous axes which do not have any padding can be combined into one axis
+ *
+ * Reasoning:
+ * ----------
+         * Suppose an item's indices in the input tensor are [i1, i2, i3, ...]. Let the first two axes not have any
+ * padding. The indices in the output tensor will be [i1, i2, i3 + off3, ...].
+ *
+ * Each axis in the contiguous unpadded axes sequence will add an offset of iN * strideN. In the above example,
+ * the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
+ * a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
+ * Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
+ */
+ for (int i = 0; i < inShape.size(); i++) {
+ /* check if axis `i` requires any padding */
+ if (ranges[i].first == 0 && ranges[i].second == inShape[i]) {
+ /* loop invariant: `i` is the first axis in the contiguous unpadded axis sequence */
+ CV_Assert(inShape[i] == outShape[i]);
+
+ /* we now iterate through the axes which follow and try to merge */
+ int j = i + 1; /* `j` is the axis which we will attempt to merge */
+ while (j < inShape.size() && ranges[j].first == 0 && ranges[j].second == inShape[j]) {
+ CV_Assert(inShape[j] == outShape[j]);
+
+ /* `j` is also unpadded; merge `i` and `j` */
+ auto new_size = inShape[i] * inShape[j];
+ inShape[i] = new_size;
+ outShape[i] = new_size;
+ ranges[i].second = new_size;
+
+ /* delete axis `j` */
+ inShape.erase(std::begin(inShape) + j);
+ outShape.erase(std::begin(outShape) + j);
+ ranges.erase(std::begin(ranges) + j);
+
+ /* optimizations should not break the invariants */
+ CV_Assert(inShape.size() == outShape.size());
+ CV_Assert(inShape.size() == ranges.size());
+ CV_Assert(inShape[i] == outShape[i]);
+ CV_Assert(ranges[i].first == 0 && ranges[i].second == inShape[i]);
+ }
+ }
+ }
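+
+        /* e.g. an input of shape [2, 3, 4, 5] padded only along the last axis collapses to
+         * [24, 5]: the three leading unpadded axes merge into a single axis of size 24
+         */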
+
+ auto rank = inShape.size();
+
+ std::vector<std::size_t> inStride(rank), outStride(rank);
+ inStride.back() = 1;
+ outStride.back() = 1;
+ /* garbage, ..., garbage, 1 */
+
+ std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
+ std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
+ /* dim[0], dim[1], ..., dim[-1], 1 */
+
+        std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
+        std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
+ /* stride[0], stride[1], ..., stride[-2], 1 */
+
+ CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
+ copy_with_reflection101_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, ranges);
+ }
+
+ template void copy_with_reflection101(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
+ template void copy_with_reflection101(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "array.hpp"
+#include "types.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+#include "kernel_dispatcher.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <algorithm>
+#include <functional>
+#include <numeric>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ namespace raw {
+ template <class T, std::size_t Rank>
+ __global__ void permute(
+ array<index_type, Rank> axis_order,
+ Span<T> output, array<size_type, Rank> outStrides,
+ View<T> input, array<size_type, Rank> inStrides)
+ {
+ for (auto i : grid_stride_range(input.size())) {
+ index_type oldPosition = 0;
+ index_type newPosition = i;
+
+ for (int j = 0; j < Rank; j++)
+ {
+ auto order = axis_order[j];
+ oldPosition += (newPosition / outStrides[j]) * inStrides[order];
+ newPosition %= outStrides[j];
+ }
+
+ output[i] = input[oldPosition];
+ }
+ }
+ }
+
+ template <class T, std::size_t Rank> static
+ void launch_permute_kernel(
+ const Stream& stream,
+ const std::vector<std::size_t>& order,
+ Span<T> output, const std::vector<std::size_t>& outStride,
+ View<T> input, const std::vector<std::size_t>& inStride)
+ {
+ CV_Assert(order.size() == Rank);
+ CV_Assert(outStride.size() == Rank);
+ CV_Assert(inStride.size() == Rank);
+
+ array<index_type, Rank> order_k;
+ order_k.assign(std::begin(order), std::end(order));
+
+ array<size_type, Rank> outStride_k, inStride_k;
+ outStride_k.assign(std::begin(outStride), std::end(outStride));
+ inStride_k.assign(std::begin(inStride), std::end(inStride));
+
+ auto kernel = raw::permute<T, Rank>;
+ auto policy = make_policy(kernel, input.size(), 0, stream);
+ launch_kernel(kernel, policy, order_k, output, outStride_k, input, inStride_k);
+ }
+
+ GENERATE_KERNEL_DISPATCHER(permute_dispatcher, launch_permute_kernel);
+
+ template <class T>
+ void permute(
+ const Stream& stream,
+ TensorSpan<T> output, TensorView<T> input,
+ std::vector<std::size_t> order)
+ {
+ CV_Assert(output.rank() == input.rank());
+ CV_Assert(input.rank() == order.size());
+ CV_Assert(input.size() == output.size());
+
+        /* squeezable axes at the beginning of both tensors which aren't permuted can be eliminated
+ *
+ * Reasoning:
+ * ----------
+         * Suppose an item's indices in the input tensor are [i1, i2, ...]. The indices in the
+ * output tensor will be some permutation of the input tensor indices. Let the output
+ * tensor indices be [o1, o2, ...]. The permutation operation essentially copies items
+ * from the input tensor to new locations in the output tensor as dictated by the indices.
+ *
+ * If the size of the first axis of the input and output tensor is one and these axes are
+         * not involved in any permutation, i.e. order[0] = 0, the input and output indices for
+         * all the elements will be of the form [0, i2, ...] and [0, o2, ...] respectively.
+         * The first index does not contribute to the element's address calculation and hence does
+         * nothing apart from eating up a few cycles.
+ */
+ while (order[0] == 0 && input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
+ /* remove the axes */
+ input.squeeze(0);
+ output.squeeze(0);
+
+ /* when we remove axis zero, the axis index will be one less than the previous index
+ * for the remaining axes
+ */
+ order.erase(order.begin());
+ for (auto& axis : order)
+ axis--;
+
+ /* optimizations should not break the invariants */
+ CV_Assert(output.rank() == input.rank());
+ CV_Assert(input.rank() == order.size());
+ CV_Assert(input.size() == output.size());
+ }
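+
+        /* e.g. a [1, 3, 4] to [1, 4, 3] permutation with order {0, 2, 1} reduces to a
+         * [3, 4] to [4, 3] permutation with order {1, 0}
+         */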
+
+ auto rank = output.rank();
+ auto inShape = input.shape_as_vector();
+ auto outShape = output.shape_as_vector();
+
+ std::vector<std::size_t> inStride(rank), outStride(rank);
+ inStride.back() = 1;
+ outStride.back() = 1;
+ /* garbage, ..., garbage, 1 */
+
+ std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
+ std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
+ /* dim[0], dim[1], ..., dim[-1], 1 */
+
+ std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
+ std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
+ /* stride[0], stride[1], ..., stride[-2], 1 */
+
+ CV_Assert(2 <= rank && rank <= CSL_MAX_TENSOR_RANK);
+ permute_dispatcher<T, 2, CSL_MAX_TENSOR_RANK>(rank, stream, order, output, outStride, input, inStride);
+ }
+
+ template void permute(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
+ template void permute(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "array.hpp"
+#include "math.hpp"
+#include "types.hpp"
+#include "vector_traits.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ namespace raw {
+ template <class T, bool Normalize>
+ __global__ void prior_box(
+ Span<T> output,
+ View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
+ size_type layerWidth, size_type layerHeight,
+ size_type imageWidth, size_type imageHeight)
+ {
+            /* each box consists of two pairs of coordinates and hence 4 values in total */
+            /* since the entire output (the first channel at least) consists of these boxes,
+             * we are guaranteed that the output is aligned to a boundary of 4 values
+ */
+ using vector_type = get_vector_type_t<T, 4>;
+ auto output_vPtr = vector_type::get_pointer(output.data());
+
+ /* num_points contains the number of points in the feature map of interest
+ * each iteration of the stride loop selects a point and generates prior boxes for it
+ */
+ size_type num_points = layerWidth * layerHeight;
+ for (auto idx : grid_stride_range(num_points)) {
+ const index_type x = idx % layerWidth,
+ y = idx / layerWidth;
+
+ index_type output_offset_v4 = idx * offsetX.size() * boxWidth.size();
+ for (int i = 0; i < boxWidth.size(); i++) {
+ for (int j = 0; j < offsetX.size(); j++) {
+ float center_x = (x + offsetX[j]) * stepX;
+ float center_y = (y + offsetY[j]) * stepY;
+
+ vector_type vec;
+ if(Normalize) {
+ vec.data[0] = (center_x - boxWidth[i] * 0.5f) / imageWidth;
+ vec.data[1] = (center_y - boxHeight[i] * 0.5f) / imageHeight;
+ vec.data[2] = (center_x + boxWidth[i] * 0.5f) / imageWidth;
+ vec.data[3] = (center_y + boxHeight[i] * 0.5f) / imageHeight;
+ } else {
+ vec.data[0] = center_x - boxWidth[i] * 0.5f;
+ vec.data[1] = center_y - boxHeight[i] * 0.5f;
+ vec.data[2] = center_x + boxWidth[i] * 0.5f - 1.0f;
+ vec.data[3] = center_y + boxHeight[i] * 0.5f - 1.0f;
+ }
+
+ v_store(output_vPtr[output_offset_v4], vec);
+ output_offset_v4++;
+ }
+ }
+ }
+ }
+
+ template <class T>
+ __global__ void prior_box_clip(Span<T> output) {
+ for (auto i : grid_stride_range(output.size())) {
+ using device::clamp;
+ output[i] = clamp<T>(output[i], 0.0, 1.0);
+ }
+ }
+
+ template <class T>
+ __global__ void prior_box_set_variance1(Span<T> output, float variance) {
+ using vector_type = get_vector_type_t<T, 4>;
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ for (auto i : grid_stride_range(output.size() / 4)) {
+ vector_type vec;
+ for (int j = 0; j < 4; j++)
+ vec.data[j] = variance;
+ v_store(output_vPtr[i], vec);
+ }
+ }
+
+ template <class T>
+ __global__ void prior_box_set_variance4(Span<T> output, array<float, 4> variance) {
+ using vector_type = get_vector_type_t<T, 4>;
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ for (auto i : grid_stride_range(output.size() / 4)) {
+ vector_type vec;
+ for(int j = 0; j < 4; j++)
+ vec.data[j] = variance[j];
+ v_store(output_vPtr[i], vec);
+ }
+ }
+ }
+
+ template <class T, bool Normalize> static
+ void launch_prior_box_kernel(
+ const Stream& stream,
+ Span<T> output, View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
+ std::size_t layerWidth, std::size_t layerHeight, std::size_t imageWidth, std::size_t imageHeight)
+ {
+ auto num_points = layerWidth * layerHeight;
+ auto kernel = raw::prior_box<T, Normalize>;
+ auto policy = make_policy(kernel, num_points, 0, stream);
+ launch_kernel(kernel, policy,
+ output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
+ layerWidth, layerHeight, imageWidth, imageHeight);
+ }
+
+ template <class T>
+ void generate_prior_boxes(
+ const Stream& stream,
+ Span<T> output,
+ View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
+ std::vector<float> variance,
+ std::size_t numPriors,
+ std::size_t layerWidth, std::size_t layerHeight,
+ std::size_t imageWidth, std::size_t imageHeight,
+ bool normalize, bool clip)
+ {
+ if (normalize) {
+ launch_prior_box_kernel<T, true>(
+ stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
+ layerWidth, layerHeight, imageWidth, imageHeight
+ );
+ } else {
+ launch_prior_box_kernel<T, false>(
+ stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
+ layerWidth, layerHeight, imageWidth, imageHeight
+ );
+ }
+
+ std::size_t channel_size = layerHeight * layerWidth * numPriors * 4;
+ CV_Assert(channel_size * 2 == output.size());
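+
+        /* the output consists of two equally sized channels: the first holds the boxes
+         * (clipped below if requested) and the second is filled with the variances
+         */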
+
+ if (clip) {
+ auto output_span_c1 = Span<T>(output.data(), channel_size);
+ auto kernel = raw::prior_box_clip<T>;
+ auto policy = make_policy(kernel, output_span_c1.size(), 0, stream);
+ launch_kernel(kernel, policy, output_span_c1);
+ }
+
+ auto output_span_c2 = Span<T>(output.data() + channel_size, channel_size);
+ if (variance.size() == 1) {
+ auto kernel = raw::prior_box_set_variance1<T>;
+ auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream);
+ launch_kernel(kernel, policy, output_span_c2, variance[0]);
+ } else {
+ array<float, 4> variance_k;
+ variance_k.assign(std::begin(variance), std::end(variance));
+ auto kernel = raw::prior_box_set_variance4<T>;
+ auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream);
+ launch_kernel(kernel, policy, output_span_c2, variance_k);
+ }
+ }
+
+ template void generate_prior_boxes(const Stream&, Span<__half>, View<float>, View<float>, View<float>, View<float>, float, float,
+ std::vector<float>, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool);
+
+ template void generate_prior_boxes(const Stream&, Span<float>, View<float>, View<float>, View<float>, View<float>, float, float,
+ std::vector<float>, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "math.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+#include "limits.hpp"
+#include "vector_traits.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ namespace raw {
+ template <class T>
+ __global__ void sigmoid_strided(Span<T> output, View<T> input, size_type n, size_type stride, size_type offset) {
+ /* - the input is divided into equal blocks strided by `stride`
+             * - we must apply sigmoid to a contiguous range of `n` values starting from `offset` in every block
+ */
+ for (auto i : grid_stride_range(n * output.size() / stride)) {
+ auto block_idx = i / n;
+ auto index = block_idx * stride + offset + (i % n);
+
+ using device::sigmoid;
+ output[index] = sigmoid(input[index]);
+ }
+ }
+
+ template <class T>
+ __global__ void softmax_strided(Span<T> output, View<T> input, size_type n, size_type stride, size_type offset_) {
+ for (auto idx : grid_stride_range(output.size() / stride)) {
+ index_type offset = idx * stride + offset_;
+
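+                /* subtracting the largest value before exponentiation is the usual trick for
+                 * numerical stability: exp(x - max) cannot overflow and the softmax is unchanged
+                 */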
+ auto largest = numeric_limits<T>::lowest();
+ for (int i = 0; i < n; i++) {
+ using device::max;
+ largest = max(largest, output[offset + i]);
+ }
+
+ auto sum = T(0);
+ for (int i = 0; i < n; i++) {
+ using device::exp;
+ auto temp = exp(output[offset + i] - largest);
+ sum += temp;
+ output[offset + i] = temp;
+ }
+
+ for (int i = 0; i < n; i++) {
+ output[offset + i] /= sum;
+ }
+ }
+ }
+
+ template <class T>
+ __global__ void region_finalize(Span<T> output, View<T> input, View<T> bias,
+ T object_prob_cutoff, T class_prob_cutoff,
+ size_type height_norm, size_type width_norm,
+ size_type rows, size_type cols,
+ size_type boxes_per_cell,
+ size_type box_size,
+ size_type classes)
+ {
+ for (auto box_index : grid_stride_range(output.size() / box_size)) {
+ auto box_of_the_cell = box_index % boxes_per_cell; /* box number within a cell */
+ auto box_offset = box_index * box_size;
+
+ auto batch_inner_size = rows * cols * boxes_per_cell;
+ auto row_inner_size = cols * boxes_per_cell;
+ auto col_inner_size = boxes_per_cell;
+
+ auto y = (box_index % batch_inner_size) / row_inner_size;
+ auto x = (box_index % row_inner_size) / col_inner_size;
+
+ using device::sigmoid;
+ using device::exp;
+ output[box_offset + 0] = (T(x) + sigmoid(input[box_offset + 0])) / T(cols);
+ output[box_offset + 1] = (T(y) + sigmoid(input[box_offset + 1])) / T(rows);
+ output[box_offset + 2] = exp(input[box_offset + 2]) * bias[2 * box_of_the_cell + 0] / T(width_norm);
+ output[box_offset + 3] = exp(input[box_offset + 3]) * bias[2 * box_of_the_cell + 1] / T(height_norm);
+
+ /* squash objectness score into a probability */
+ T objectness_prob = sigmoid(output[box_offset + 4]);
+ output[box_offset + 4] = objectness_prob;
+
+ /* ignore prediction if the objectness probability is less than the cutoff */
+ if (objectness_prob < object_prob_cutoff)
+ objectness_prob = 0;
+
+ /* the class probabilities we have currently are conditional class probabilities
+ * given the object
+ *
+ * to obtain the actual class probability, we multiply the conditional probability
+ * with the object probability
+ */
+ const index_type class_begin = box_offset + 5; /* 4 box coordinates, 1 obj prob, class probs... */
+ const index_type class_end = class_begin + classes;
+ index_type offset = class_begin;
+
+ using vector_type = get_vector_type_t<T, 4>;
+
+ /* process each class independently until the offset is aligned to an n-element boundary */
+ while (offset % vector_type::size() != 0 && offset < class_end) {
+ T actual_class_prob = objectness_prob * output[offset];
+ if (actual_class_prob <= class_prob_cutoff)
+ actual_class_prob = T(0);
+ output[offset] = actual_class_prob;
+ offset++;
+ }
+
+ auto output_vPtr = vector_type::get_pointer(output.data() + offset);
+ for (int i = 0; (offset + vector_type::size()) < class_end; i++) {
+ vector_type vec;
+ v_load(vec, output_vPtr[i]);
+ for (int j = 0; j < vector_type::size(); j++) {
+ T actual_class_prob = objectness_prob * vec.data[j];
+ if (actual_class_prob <= class_prob_cutoff)
+ actual_class_prob = T(0);
+ vec.data[j] = actual_class_prob;
+ }
+ v_store(output_vPtr[i], vec);
+ offset += vector_type::size();
+ }
+
+ /* process the remaining classes */
+ while (offset < class_end) {
+ T actual_class_prob = objectness_prob * output[offset];
+ if (actual_class_prob <= class_prob_cutoff)
+ actual_class_prob = T(0);
+ output[offset] = actual_class_prob;
+ offset++;
+ }
+ }
+ }
+ }
+
+ template <class T>
+ void sigmoid_strided(const Stream& stream, Span<T> output, View<T> input, std::size_t n, std::size_t stride, std::size_t offset) {
+ CV_Assert(output.size() % stride == 0);
+
+ auto kernel = raw::sigmoid_strided<T>;
+ auto policy = make_policy(kernel, n * output.size() / stride, 0, stream);
+ launch_kernel(kernel, policy, output, input, n, stride, offset);
+ }
+
+ template void sigmoid_strided(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t);
+ template void sigmoid_strided(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t);
+
+ template <class T>
+ void softmax_strided(const Stream& stream, Span<T> output, View<T> input, std::size_t n, std::size_t stride, std::size_t offset) {
+ CV_Assert(output.size() % stride == 0);
+
+ auto kernel = raw::softmax_strided<T>;
+ auto policy = make_policy(kernel, output.size() / stride, 0, stream);
+ launch_kernel(kernel, policy, output, input, n, stride, offset);
+ }
+
+ template void softmax_strided(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t);
+ template void softmax_strided(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t);
+
+ template <class T>
+ void region_finalize(const Stream& stream, Span<T> output, View<T> input, View<T> bias,
+ T object_prob_cutoff, T class_prob_cutoff,
+ std::size_t height_norm, std::size_t width_norm,
+ std::size_t rows, std::size_t cols,
+ std::size_t boxes_per_cell,
+ std::size_t box_size,
+ std::size_t classes)
+ {
+ CV_Assert(output.size() % box_size == 0);
+
+ auto kernel = raw::region_finalize<T>;
+ auto policy = make_policy(kernel, output.size() / box_size, 0, stream);
+ launch_kernel(kernel, policy, output, input, bias,
+ object_prob_cutoff, class_prob_cutoff,
+ height_norm, width_norm,
+ rows, cols, boxes_per_cell, box_size, classes);
+ }
+
+ template void region_finalize(const Stream&, Span<__half>, View<__half>, View<__half>,
+ __half, __half, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t);
+
+ template void region_finalize(const Stream&, Span<float>, View<float>, View<float>,
+ float, float, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "math.hpp"
+#include "types.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ namespace raw {
+ template <class T>
+ __global__ void resize_nn(
+ Span<T> output, size_type out_height, size_type out_width,
+ View<T> input, size_type in_height, size_type in_width)
+ {
+ auto in_image_size = in_height * in_width;
+ auto out_image_size = out_height * out_width;
+
+ /* o2i = output to input */
+ auto o2i_fx = static_cast<float>(in_width) / out_width;
+ auto o2i_fy = static_cast<float>(in_height) / out_height;
+
+ /* think of the output and input as a collection of 2d images with the last axis
+ * representing the width and the last but one axis representing the height
+ *
+             * the remaining axes together form a collection of these images
+ */
+ for (auto idx : grid_stride_range(output.size())) {
+ const index_type n = idx / out_image_size;
+ const index_type x = (idx % out_image_size) % out_width;
+ const index_type y = (idx % out_image_size) / out_width;
+
+ auto in_x = static_cast<index_type>(x * o2i_fx);
+ auto in_y = static_cast<index_type>(y * o2i_fy);
+
+ index_type in_idx = n * in_image_size + in_y * in_width + in_x;
+ output[idx] = input[in_idx];
+ }
+ }
+
+ template <class T>
+ __global__ void resize_bilinear(
+ Span<T> output, size_type out_height, size_type out_width,
+ View<T> input, size_type in_height, size_type in_width,
+ float o2i_fy, float o2i_fx)
+ {
+ auto in_image_size = in_height * in_width;
+ auto out_image_size = out_height * out_width;
+
+ /* think of the output and input as a collection of 2d images with the last axis
+ * representing the width and the last but one axis representing the height
+ *
+             * the remaining axes together form a collection of these images
+ */
+ for (auto idx : grid_stride_range(output.size())) {
+ const index_type n = idx / out_image_size;
+ const index_type x = (idx % out_image_size) % out_width;
+ const index_type y = (idx % out_image_size) / out_width;
+
+ auto in_x = x * o2i_fx;
+ auto in_y = y * o2i_fy;
+
+ auto in_x0 = static_cast<index_type>(in_x);
+ auto in_y0 = static_cast<index_type>(in_y);
+
+ using device::min;
+ auto in_x1 = min<index_type>(in_x0 + 1, in_width - 1);
+ auto in_y1 = min<index_type>(in_y0 + 1, in_height - 1);
+
+ const index_type in_offset_r0 = n * in_image_size + in_y0 * in_width;
+ const index_type in_offset_r1 = n * in_image_size + in_y1 * in_width;
+
+ auto v_00 = input[in_offset_r0 + in_x0],
+ v_01 = input[in_offset_r0 + in_x1],
+ v_10 = input[in_offset_r1 + in_x0],
+ v_11 = input[in_offset_r1 + in_x1];
+
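+                /* bilinear interpolation with dx = in_x - in_x0 and dy = in_y - in_y0:
+                 * v = v00 + dy * (v10 - v00) + dx * (v01 - v00) + dx * dy * (v11 - v01 - v10 + v00)
+                 */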
+ output[idx] =
+ v_00 +
+ T(in_y - in_y0) * T(v_10 - v_00) +
+ T(in_x - in_x0) * T(v_01 - v_00) +
+ T(in_y - in_y0) * T(in_x - in_x0) * T(v_11 - v_01 - v_10 + v_00);
+ }
+ }
+ }
+
+ template <class T>
+ void resize_nn(const Stream& stream, TensorSpan<T> output, TensorView<T> input) {
+ auto in_height = input.get_axis_size(-2);
+ auto in_width = input.get_axis_size(-1);
+
+ auto out_height = output.get_axis_size(-2);
+ auto out_width = output.get_axis_size(-1);
+
+ auto kernel = raw::resize_nn<T>;
+ auto policy = make_policy(kernel, output.size(), 0, stream);
+ launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width);
+ }
+
+ template void resize_nn<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>);
+ template void resize_nn<float>(const Stream&, TensorSpan<float>, TensorView<float>);
+
+ template <class T>
+ void resize_bilinear(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x) {
+ auto in_height = input.get_axis_size(-2);
+ auto in_width = input.get_axis_size(-1);
+
+ auto out_height = output.get_axis_size(-2);
+ auto out_width = output.get_axis_size(-1);
+
+ auto kernel = raw::resize_bilinear<T>;
+ auto policy = make_policy(kernel, output.size(), 0, stream);
+ launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x);
+ }
+
+ template void resize_bilinear<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float);
+ template void resize_bilinear<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "types.hpp"
+#include "vector_traits.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ namespace raw {
+ template <class T, std::size_t N>
+ __global__ void bias1_vec(Span<T> output, View<T> input, T beta) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for (int j = 0; j < vec.size(); j++)
+ vec.data[j] = vec.data[j] + beta;
+ v_store(output_vPtr[i], vec);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> bias) {
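+            /* the input is viewed as [outer, bias size, inner_size]; every element in the
+             * inner slice of channel `c` has bias[c] added to it
+             */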
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ inner_size /= vector_type::size();
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ const index_type bias_idx = (i / inner_size) % static_cast<size_type>(bias.size());
+
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for(int j = 0; j < vec.size(); j++)
+ vec.data[j] = vec.data[j] + bias[bias_idx];
+ v_store(output_vPtr[i], vec);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void scale1_vec(Span<T> output, View<T> input, T alpha) {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for (int j = 0; j < vec.size(); j++)
+ vec.data[j] = vec.data[j] * alpha;
+ v_store(output_vPtr[i], vec);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void scaleN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights)
+ {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ inner_size /= vector_type::size();
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ const index_type scale_idx = (i / inner_size) % static_cast<size_type>(weights.size());
+
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for (int j = 0; j < vec.size(); j++)
+ vec.data[j] = vec.data[j] * weights[scale_idx];
+ v_store(output_vPtr[i], vec);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void scale1_with_bias1_vec(Span<T> output, View<T> input, T alpha, T beta)
+ {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for (int j = 0; j < vec.size(); j++)
+ vec.data[j] = alpha * vec.data[j] + beta;
+ v_store(output_vPtr[i], vec);
+ }
+ }
+
+ template <class T, std::size_t N>
+ __global__ void scaleN_with_biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights, View<T> bias)
+ {
+ using vector_type = get_vector_type_t<T, N>;
+
+ auto output_vPtr = vector_type::get_pointer(output.data());
+ auto input_vPtr = vector_type::get_pointer(input.data());
+
+ inner_size /= vector_type::size();
+ for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+ const index_type scale_idx = (i / inner_size) % static_cast<size_type>(weights.size());
+
+ vector_type vec;
+ v_load(vec, input_vPtr[i]);
+ for (int j = 0; j < vec.size(); j++)
+ vec.data[j] = vec.data[j] * weights[scale_idx] + bias[scale_idx];
+ v_store(output_vPtr[i], vec);
+ }
+ }
+ }
+
+ template <class T, std::size_t N> static
+ void launch_bias1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T beta) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+
+ auto kernel = raw::bias1_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input, beta);
+ }
+
+ template <class T>
+ void bias1(const Stream& stream, TensorSpan<T> output, TensorView<T> input, T beta) {
+ CV_Assert(is_shape_same(input, output));
+
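+        /* choose the widest vector width (4, then 2, then scalar) whose alignment
+         * requirements both tensors satisfy; the kernels below use the same dispatch
+         */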
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+ launch_bias1_vec_kernel<T, 4>(stream, output, input, beta);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+ launch_bias1_vec_kernel<T, 2>(stream, output, input, beta);
+ } else {
+ launch_bias1_vec_kernel<T, 1>(stream, output, input, beta);
+ }
+ }
+
+ template void bias1<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, __half);
+ template void bias1<float>(const Stream&, TensorSpan<float>, TensorView<float>, float);
+
+ template <class T, std::size_t N> static
+ void launch_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> bias){
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+ CV_Assert(inner_size % N == 0);
+
+ auto kernel = raw::biasN_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input, inner_size, bias);
+ }
+
+ template <class T>
+ void biasN(
+ const Stream& stream,
+ TensorSpan<T> output,
+ TensorView<T> input, std::size_t inner_size,
+ TensorView<T> bias)
+ {
+ CV_Assert(is_shape_same(input, output));
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
+ launch_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, bias);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
+ launch_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, bias);
+ } else {
+ launch_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, bias);
+ }
+ }
+
+ template void biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>);
+ template void biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
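+
+    /* `inner_size` is the number of contiguous elements that share a bias value; for an NCHW
+     * tensor with one bias per channel, `inner_size` is H * W and `bias` holds C values
+     */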
+
+ template <class T, std::size_t N> static
+ void launch_scale1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T alpha) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+
+ auto kernel = raw::scale1_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input, alpha);
+ }
+
+ template <class T>
+ void scale1(const Stream& stream, TensorSpan<T> output, TensorView<T> input, T alpha) {
+ CV_Assert(is_shape_same(input, output));
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+ launch_scale1_vec_kernel<T, 4>(stream, output, input, alpha);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+ launch_scale1_vec_kernel<T, 2>(stream, output, input, alpha);
+ } else {
+ launch_scale1_vec_kernel<T, 1>(stream, output, input, alpha);
+ }
+ }
+
+ template void scale1<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, __half);
+ template void scale1<float>(const Stream&, TensorSpan<float>, TensorView<float>, float);
+
+ template <class T, std::size_t N> static
+ void launch_scaleN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+ CV_Assert(inner_size % N == 0);
+
+ auto kernel = raw::scaleN_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input, inner_size, weights);
+ }
+
+ template <class T>
+ void scaleN(
+ const Stream& stream,
+ TensorSpan<T> output,
+ TensorView<T> input, std::size_t inner_size,
+ TensorView<T> weights)
+ {
+ CV_Assert(is_shape_same(input, output));
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
+ launch_scaleN_vec_kernel<T, 4>(stream, output, input, inner_size, weights);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
+ launch_scaleN_vec_kernel<T, 2>(stream, output, input, inner_size, weights);
+ } else {
+ launch_scaleN_vec_kernel<T, 1>(stream, output, input, inner_size, weights);
+ }
+ }
+
+ template void scaleN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>);
+ template void scaleN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
+
+ template <class T, std::size_t N> static
+ void launch_scale1_with_bias1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+
+ auto kernel = raw::scale1_with_bias1_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input, alpha, beta);
+ }
+
+ template <class T>
+ void scale1_with_bias1(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
+ CV_Assert(output.size() == input.size());
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+ launch_scale1_with_bias1_vec_kernel<T, 4>(stream, output, input, alpha, beta);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+ launch_scale1_with_bias1_vec_kernel<T, 2>(stream, output, input, alpha, beta);
+ } else {
+ launch_scale1_with_bias1_vec_kernel<T, 1>(stream, output, input, alpha, beta);
+ }
+ }
+
+ template void scale1_with_bias1<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
+ template void scale1_with_bias1<float>(const Stream&, Span<float>, View<float>, float, float);
+
+ template <class T, std::size_t N> static
+ void launch_scaleN_with_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights, View<T> bias) {
+ CV_Assert(is_fully_aligned<T>(output, N));
+ CV_Assert(is_fully_aligned<T>(input, N));
+ CV_Assert(inner_size % N == 0);
+
+ auto kernel = raw::scaleN_with_biasN_vec<T, N>;
+ auto policy = make_policy(kernel, output.size() / N, 0, stream);
+ launch_kernel(kernel, policy, output, input, inner_size, weights, bias);
+ }
+
+ template <class T>
+ void scaleN_with_biasN(
+ const Stream& stream,
+ TensorSpan<T> output,
+ TensorView<T> input, std::size_t inner_size,
+ TensorView<T> weights, TensorView<T> bias)
+ {
+ CV_Assert(is_shape_same(input, output));
+ CV_Assert(weights.size() == bias.size());
+
+ if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
+ launch_scaleN_with_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, weights, bias);
+ } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
+ launch_scaleN_with_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, weights, bias);
+ } else {
+ launch_scaleN_with_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, weights, bias);
+ }
+ }
+
+ template void scaleN_with_biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>, TensorView<__half>);
+ template void scaleN_with_biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>, TensorView<float>);
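+
+    /* usage sketch (names assumed for illustration): applying a batchnorm-style per-channel
+     * transform to an NCHW tensor, with `weights` and `bias` holding C values each:
+     *
+     *     kernels::scaleN_with_biasN<float>(stream, output, input, H * W, weights, bias);
+     */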
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "array.hpp"
+#include "types.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+#include "kernel_dispatcher.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <algorithm>
+#include <functional>
+#include <iterator>
+#include <numeric>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ namespace raw {
+ template <class T, std::size_t Rank>
+ __global__ void slice(
+ Span<T> output, array<size_type, Rank> out_strides,
+ View<T> input, array<size_type, Rank> in_strides, array<index_type, Rank> in_offset)
+ {
+ for (auto i : grid_stride_range(output.size())) {
+ index_type out_index = i / out_strides[0];
+ index_type in_index = in_offset[0] + out_index;
+ index_type iidx = in_index * in_strides[0];
+ for (int j = 1; j < Rank; j++) {
+ out_index = (i % out_strides[j - 1]) / out_strides[j];
+ in_index = in_offset[j] + out_index;
+ iidx += in_index * in_strides[j];
+ }
+
+ output[i] = input[iidx];
+ }
+ }
+ }
+
+ template <class T, std::size_t Rank> static
+ void launch_slice(
+ const Stream& stream,
+ Span<T> output, const std::vector<std::size_t>& outStride,
+ View<T> input, const std::vector<std::size_t>& inStride, const std::vector<std::size_t>& inOffset)
+ {
+ CV_Assert(outStride.size() == Rank);
+ CV_Assert(inStride.size() == Rank);
+ CV_Assert(inOffset.size() == Rank);
+
+ array<size_type, Rank> outStride_k, inStride_k;
+ outStride_k.assign(std::begin(outStride), std::end(outStride));
+ inStride_k.assign(std::begin(inStride), std::end(inStride));
+
+ array<index_type, Rank> inOffset_k;
+ inOffset_k.assign(std::begin(inOffset), std::end(inOffset));
+
+ auto kernel = raw::slice<T, Rank>;
+ auto policy = make_policy(kernel, output.size(), 0, stream);
+ launch_kernel(kernel, policy, output, outStride_k, input, inStride_k, inOffset_k);
+ }
+
+ GENERATE_KERNEL_DISPATCHER(slice_dispatcher, launch_slice);
+
+ template <class T>
+ void slice(const Stream& stream,
+ TensorSpan<T> output, TensorView<T> input,
+ std::vector<std::size_t> offsets)
+ {
+ CV_Assert(output.rank() == input.rank());
+ CV_Assert(output.rank() == offsets.size());
+
+        /* squeezable axes at the beginning of both tensors can be eliminated
+         *
+         * Reasoning:
+         * ----------
+         * Suppose an item's indices in the output tensor are [o1, o2, ...]. The indices in the input
+         * tensor will be [o1 + off1, o2 + off2, ...]. The rest of the elements in the input are ignored.
+         *
+         * If the size of the first axis of the input and output tensor is unity, the input and output indices
+         * for all the elements will be of the form [0, o2 + off2, ...] and [0, o2, ...] respectively. Note that
+         * there cannot be any ignored items since the axes have unit size. The first index does not contribute to the
+         * element's address calculation and hence does nothing apart from eating up a few cycles.
+         */
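+        /* e.g. input shape [1, 4, 6] with output shape [1, 2, 3] and offsets [0, 1, 2]
+         * reduces to input [4, 6], output [2, 3], offsets [1, 2]
+         */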
+ while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
+ CV_Assert(offsets[0] == 0);
+
+ input.squeeze(0);
+ output.squeeze(0);
+ offsets.erase(std::begin(offsets));
+
+ CV_Assert(output.rank() == input.rank());
+ CV_Assert(output.rank() == offsets.size());
+ }
+
+ auto inShape = input.shape_as_vector();
+ auto outShape = output.shape_as_vector();
+
+ /* contiguous axes which do not undergo slicing can be combined into one axis
+ *
+ * Reasoning:
+ * ----------
+         * Suppose an item's indices in the output tensor are [o1, o2, o3, ...]. Let the first two axes not undergo any
+         * slicing. The indices in the input tensor will be [o1, o2, o3 + off3, ...].
+         *
+         * Each axis in the contiguous unsliced axes sequence will add an offset of iN * strideN. In the above example,
+         * the two axes add a total offset of `o1 * stride1 + o2 * stride2`. We can merge the two axes into one axis with
+         * a size of `size1 * size2`. The new offset added will be `o12 * stride2` as the kernel iterates through `o12`.
+         * Note that `o12` is actually `(o1 * size2 + o2)` in the original tensor.
+ */
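+        /* e.g. input shape [2, 3, 4, 5] with output shape [2, 3, 4, 2] and offsets [0, 0, 0, 1]:
+         * the first three axes are unsliced and merge into one, giving input [24, 5], output [24, 2]
+         * and offsets [0, 1]
+         */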
+ for (int i = 0; i < inShape.size(); i++) {
+ /* check if axis `i` requires any slicing */
+ if (offsets[i] == 0 && inShape[i] == outShape[i]) {
+ /* loop invariant: `i` is the first axis in the contiguous unsliced axis sequence */
+
+ int j = i + 1; /* `j` is the axis which we will attempt to merge */
+ while (j < inShape.size() && offsets[j] == 0 && inShape[j] == outShape[j]) {
+ /* `j` axis is also unsliced; merge `i` and `j` */
+ auto new_size = inShape[i] * inShape[j];
+ inShape[i] = new_size;
+ outShape[i] = new_size;
+ offsets[i] = 0; /* redundant */
+
+ /* delete axis `j` */
+ inShape.erase(std::begin(inShape) + j);
+ outShape.erase(std::begin(outShape) + j);
+ offsets.erase(std::begin(offsets) + j);
+
+ /* optimizations should not break the invariants */
+ CV_Assert(inShape.size() == outShape.size());
+ CV_Assert(inShape.size() == offsets.size());
+ CV_Assert(inShape[i] == outShape[i]);
+ CV_Assert(offsets[i] == 0);
+ }
+ }
+ }
+
+ auto rank = inShape.size();
+
+ std::vector<std::size_t> inStride(rank), outStride(rank);
+ inStride.back() = 1;
+ outStride.back() = 1;
+ /* garbage, ..., garbage, 1 */
+
+ std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
+ std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
+ /* dim[0], dim[1], ..., dim[-1], 1 */
+
+ std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
+ std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
+ /* stride[0], stride[1], ..., stride[-2], 1 */
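+        /* e.g. for inShape [24, 5] and outShape [24, 2]: inStride is [5, 1] and outStride is [2, 1] */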
+
+ CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
+ slice_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, offsets);
+ }
+
+ template void slice(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
+ template void slice(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
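+
+    /* usage sketch (names assumed): extracting a centered 1x3x100x100 crop from a 1x3x224x224 tensor:
+     *
+     *     kernels::slice<float>(stream, output, input, {0, 0, 62, 62});
+     */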
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+++ /dev/null
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-// this file is a stub and will be removed once actual code is added
-
-#include "../precomp.hpp"
-
-#include <cuda_runtime.h>
-
-#ifndef HAVE_CUDA
-# error "CUDA files should not be compiled if CUDA was not enabled"
-#endif
-
-__global__ void cuda4dnn_build_test_kernel(float* addr) {
- int idx = threadIdx.x;
- addr[idx] = 0.0;
-}
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_TYPES_HPP
+#define OPENCV_DNN_SRC_CUDA_TYPES_HPP
+
+#include <cstdint>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
+
+    /* For indices, we can use 32-bit or 64-bit variables. The GPU registers are 32 bits in size.
+     * Hence, a 64-bit variable requires two registers and is significantly slower than the 32-bit version.
+     *
+     * If we do not need to handle huge tensors, we can use 32-bit indices and get better performance.
+     */
+#ifdef __CUDACC__
+ using size_type = int;
+ using index_type = int;
+#else
+ using size_type = std::int32_t;
+ using index_type = std::int32_t;
+#endif
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
+
+#endif /* OPENCV_DNN_SRC_CUDA_TYPES_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP
+#define OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP
+
+#include <cuda_runtime.h>
+
+#include "types.hpp"
+
+#include "../cuda4dnn/csl/pointer.hpp"
+
+#include <type_traits>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
+
+ /** \file vector_traits.hpp
+ * \brief utility classes and functions for vectorized memory loads/stores
+ *
+ * Example:
+ * using vector_type = get_vector_type_t<float, 4>;
+ *
+     *    auto input_vPtr = vector_type::get_pointer(iptr); // iptr is of type DevicePtr<const float>
+     *    auto output_vPtr = vector_type::get_pointer(optr); // optr is of type DevicePtr<float>
+     *
+     *    vector_type vec;
+     *    v_load(vec, input_vPtr);
+     *
+     *    for(int i = 0; i < vector_type::size(); i++)
+     *        vec.data[i] = do_something(vec.data[i]);
+ *
+ * v_store(output_vPtr, vec);
+ */
+
+ namespace detail {
+ template <size_type N> struct raw_type_ { };
+ template <> struct raw_type_<256> { typedef ulonglong4 type; };
+ template <> struct raw_type_<128> { typedef uint4 type; };
+ template <> struct raw_type_<64> { typedef uint2 type; };
+ template <> struct raw_type_<32> { typedef uint1 type; };
+ template <> struct raw_type_<16> { typedef uchar2 type; };
+ template <> struct raw_type_<8> { typedef uchar1 type; };
+
+ template <size_type N> struct raw_type {
+ using type = typename raw_type_<N>::type;
+ static_assert(sizeof(type) * 8 == N, "");
+ };
+ }
+
+ /* \tparam T type of element in the vector
+ * \tparam N "number of elements" of type T in the vector
+ */
+ template <class T, size_type N>
+ union vector_type {
+ using value_type = T;
+ using raw_type = typename detail::raw_type<N * sizeof(T) * 8>::type;
+
+ __device__ vector_type() { }
+
+ __device__ static constexpr size_type size() { return N; }
+
+ raw_type raw;
+ T data[N];
+
+ template <class U> static __device__
+ typename std::enable_if<std::is_const<U>::value, const vector_type*>
+ ::type get_pointer(csl::DevicePtr<U> ptr) {
+ return reinterpret_cast<const vector_type*>(ptr.get());
+ }
+
+ template <class U> static __device__
+ typename std::enable_if<!std::is_const<U>::value, vector_type*>
+ ::type get_pointer(csl::DevicePtr<U> ptr) {
+ return reinterpret_cast<vector_type*>(ptr.get());
+ }
+ };
+
+ template <class V>
+ __device__ void v_load(V& dest, const V& src) {
+ dest.raw = src.raw;
+ }
+
+ template <class V>
+ __device__ void v_load(V& dest, const V* src) {
+ dest.raw = src->raw;
+ }
+
+ template <class V>
+ __device__ void v_store(V* dest, const V& src) {
+ dest->raw = src.raw;
+ }
+
+ template <class V>
+ __device__ void v_store(V& dest, const V& src) {
+ dest.raw = src.raw;
+ }
+
+ template <class T, size_type N>
+ struct get_vector_type {
+ typedef vector_type<T, N> type;
+ };
+
+ template <class T, size_type N>
+ using get_vector_type_t = typename get_vector_type<T, N>::type;
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
+
+#endif /* OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP
+
+#include "error.hpp"
+#include "stream.hpp"
+#include "pointer.hpp"
+#include "fp16.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cublas_v2.h>
+
+#include <cstddef>
+#include <memory>
+#include <utility>
+
+#define CUDA4DNN_CHECK_CUBLAS(call) \
+ ::cv::dnn::cuda4dnn::csl::cublas::detail::check((call), CV_Func, __FILE__, __LINE__)
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cublas {
+
+ /** @brief exception class for errors thrown by the cuBLAS API */
+ class cuBLASException : public CUDAException {
+ public:
+ using CUDAException::CUDAException;
+ };
+
+ namespace detail {
+ static void check(cublasStatus_t status, const char* func, const char* file, int line) {
+ auto cublasGetErrorString = [](cublasStatus_t err) {
+ switch (err) {
+ case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+ case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+ case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+ case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+ case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+ case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+ case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+ case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+ case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
+ case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
+ }
+ return "UNKNOWN_CUBLAS_ERROR";
+ };
+
+ if (status != CUBLAS_STATUS_SUCCESS)
+ throw cuBLASException(Error::GpuApiCallError, cublasGetErrorString(status), func, file, line);
+ }
+ }
+
+    /** @brief noncopyable cuBLAS smart handle
+ *
+ * UniqueHandle is a smart non-sharable wrapper for cuBLAS handle which ensures that the handle
+ * is destroyed after use. The handle can be associated with a CUDA stream by specifying the
+ * stream during construction. By default, the handle is associated with the default stream.
+ */
+ class UniqueHandle {
+ public:
+ UniqueHandle() { CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle)); }
+ UniqueHandle(UniqueHandle&) = delete;
+ UniqueHandle(UniqueHandle&& other) noexcept
+ : stream(std::move(other.stream)), handle{ other.handle } {
+ other.handle = nullptr;
+ }
+
+ UniqueHandle(Stream strm) : stream(std::move(strm)) {
+ CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle));
+ try {
+ CUDA4DNN_CHECK_CUBLAS(cublasSetStream(handle, stream.get()));
+ } catch (...) {
+ /* cublasDestroy won't throw if a valid handle is passed */
+ CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
+ throw;
+ }
+ }
+
+ ~UniqueHandle() noexcept {
+ if (handle != nullptr) {
+ /* cublasDestroy won't throw if a valid handle is passed */
+ CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
+ }
+ }
+
+ UniqueHandle& operator=(const UniqueHandle&) = delete;
+ UniqueHandle& operator=(UniqueHandle&& other) noexcept {
+ stream = std::move(other.stream);
+ handle = other.handle;
+ other.handle = nullptr;
+ return *this;
+ }
+
+ /** @brief returns the raw cuBLAS handle */
+ cublasHandle_t get() const noexcept { return handle; }
+
+ private:
+ Stream stream;
+ cublasHandle_t handle;
+ };
+
+ /** @brief sharable cuBLAS smart handle
+ *
+ * Handle is a smart sharable wrapper for cuBLAS handle which ensures that the handle
+ * is destroyed after all references to the handle are destroyed. The handle can be
+ * associated with a CUDA stream by specifying the stream during construction. By default,
+ * the handle is associated with the default stream.
+ *
+ * @note Moving a Handle object to another invalidates the former
+ */
+ class Handle {
+ public:
+ Handle() : handle(std::make_shared<UniqueHandle>()) { }
+ Handle(const Handle&) = default;
+ Handle(Handle&&) = default;
+ Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { }
+
+ Handle& operator=(const Handle&) = default;
+ Handle& operator=(Handle&&) = default;
+
+ /** returns true if the handle is valid */
+ explicit operator bool() const noexcept { return static_cast<bool>(handle); }
+
+ cublasHandle_t get() const noexcept {
+ CV_Assert(handle);
+ return handle->get();
+ }
+
+ private:
+ std::shared_ptr<UniqueHandle> handle;
+ };
+
+    /** @brief GEMM for column-major matrices
+ *
+ * \f$ C = \alpha AB + \beta C \f$
+ *
+ * @tparam T matrix element type (must be `half` or `float`)
+ *
+ * @param handle valid cuBLAS Handle
+ * @param transa use transposed matrix of A for computation
+ * @param transb use transposed matrix of B for computation
+ * @param rows_c number of rows in C
+ * @param cols_c number of columns in C
+ * @param common_dim common dimension of A (or trans A) and B (or trans B)
+ * @param alpha scale factor for AB
+ * @param[in] A pointer to column-major matrix A in device memory
+ * @param lda leading dimension of matrix A
+ * @param[in] B pointer to column-major matrix B in device memory
+ * @param ldb leading dimension of matrix B
+ * @param beta scale factor for C
+ * @param[in,out] C pointer to column-major matrix C in device memory
+ * @param ldc leading dimension of matrix C
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void gemm(const Handle& handle,
+ bool transa, bool transb,
+ std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
+ T alpha, const DevicePtr<const T> A, std::size_t lda,
+ const DevicePtr<const T> B, std::size_t ldb,
+ T beta, const DevicePtr<T> C, std::size_t ldc);
+
+ template <> inline
+ void gemm<half>(const Handle& handle,
+ bool transa, bool transb,
+ std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
+ half alpha, const DevicePtr<const half> A, std::size_t lda,
+ const DevicePtr<const half> B, std::size_t ldb,
+ half beta, const DevicePtr<half> C, std::size_t ldc)
+ {
+ CV_Assert(handle);
+
+ auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
+ opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
+ int irows_c = static_cast<int>(rows_c),
+ icols_c = static_cast<int>(cols_c),
+ icommon_dim = static_cast<int>(common_dim),
+ ilda = static_cast<int>(lda),
+ ildb = static_cast<int>(ldb),
+ ildc = static_cast<int>(ldc);
+
+ CUDA4DNN_CHECK_CUBLAS(
+ cublasHgemm(
+ handle.get(),
+ opa, opb,
+ irows_c, icols_c, icommon_dim,
+ &alpha, A.get(), ilda,
+ B.get(), ildb,
+ &beta, C.get(), ildc
+ )
+ );
+ }
+
+ template <> inline
+ void gemm<float>(const Handle& handle,
+ bool transa, bool transb,
+ std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
+ float alpha, const DevicePtr<const float> A, std::size_t lda,
+ const DevicePtr<const float> B, std::size_t ldb,
+ float beta, const DevicePtr<float> C, std::size_t ldc)
+ {
+ CV_Assert(handle);
+
+ auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
+ opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
+ int irows_c = static_cast<int>(rows_c),
+ icols_c = static_cast<int>(cols_c),
+ icommon_dim = static_cast<int>(common_dim),
+ ilda = static_cast<int>(lda),
+ ildb = static_cast<int>(ldb),
+ ildc = static_cast<int>(ldc);
+
+ CUDA4DNN_CHECK_CUBLAS(
+ cublasSgemm(
+ handle.get(),
+ opa, opb,
+ irows_c, icols_c, icommon_dim,
+ &alpha, A.get(), ilda,
+ B.get(), ildb,
+ &beta, C.get(), ildc
+ )
+ );
+ }
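+
+    /* usage sketch (names assumed): computes C = A * B for column-major A (m x k), B (k x n)
+     * and C (m x n) stored in device memory:
+     *
+     *     cublas::gemm<float>(handle,
+     *         false, false,
+     *         m, n, k,
+     *         1.0f, A, m,
+     *               B, k,
+     *         0.0f, C, m);
+     */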
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP
+
+#include "cudnn/cudnn.hpp"
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
+
+#include "cudnn.hpp"
+
+#include "../pointer.hpp"
+#include "../workspace.hpp"
+
+#include <cudnn.h>
+
+#include <cstddef>
+#include <array>
+#include <algorithm>
+#include <vector>
+#include <type_traits>
+#include <iterator>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+ /** describe convolution filters
+ *
+ * @tparam T type of elements in the kernels
+ */
+ template <class T>
+ class FilterDescriptor {
+ public:
+ FilterDescriptor() noexcept : descriptor{ nullptr } { }
+ FilterDescriptor(const FilterDescriptor&) = delete;
+ FilterDescriptor(FilterDescriptor&& other) noexcept
+ : descriptor{ other.descriptor } {
+ other.descriptor = nullptr;
+ }
+
+ /** constructs a filter descriptor from the filter dimensions provided in \p shape
+ *
+ * Shape dimensions:
+ * 0: number of filters
+ * 1: number of input feature maps
+ * 2..n: kernel dimensions
+ *
+ * Exception Guarantee: Strong
+ */
+ template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
+ FilterDescriptor(const SequenceContainer& shape) {
+ constructor(shape.begin(), shape.end());
+ }
+
+ /** constructs a filter descriptor from the filter dimensions provided in [begin, end)
+ *
+ * Shape dimensions:
+ * 0: number of filters
+ * 1: number of input feature maps
+ * 2..n: kernel dimensions
+ *
+ * Exception Guarantee: Strong
+ */
+ template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
+ FilterDescriptor(ForwardItr begin, ForwardItr end) {
+ constructor(begin, end);
+ }
+
+ /** constructs a filter descriptor from the filter dimensions provided as arguments
+ *
+ * Shape dimensions:
+ * 0: number of filters
+ * 1: number of input feature maps
+ * 2..n: kernel dimensions
+ *
+ * Exception Guarantee: Strong
+ */
+ template <class ...Sizes>
+ FilterDescriptor(Sizes ...sizes) {
+ static_assert(sizeof...(Sizes) >= 3, "filter descriptors must have at least three dimensions");
+ static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank");
+ std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... };
+ constructor(std::begin(dims), std::end(dims));
+ }
+
+ ~FilterDescriptor() noexcept {
+ if (descriptor != nullptr) {
+ /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
+ CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
+ }
+ }
+
+ FilterDescriptor& operator=(const FilterDescriptor&) = delete;
+ FilterDescriptor& operator=(FilterDescriptor&& other) noexcept {
+ descriptor = other.descriptor;
+ other.descriptor = nullptr;
+ return *this;
+ };
+
+ cudnnFilterDescriptor_t get() const noexcept { return descriptor; }
+
+ private:
+ template <class ForwardItr>
+ void constructor(ForwardItr start, ForwardItr end) {
+ CV_Assert(start != end);
+ CV_Assert(std::distance(start, end) >= 3);
+ CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX);
+
+ CUDA4DNN_CHECK_CUDNN(cudnnCreateFilterDescriptor(&descriptor));
+ try {
+ const auto rank = std::distance(start, end);
+ if (rank == 4) {
+ std::array<int, 4> dims;
+ std::copy(start, end, std::begin(dims));
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnSetFilter4dDescriptor(
+ descriptor,
+ detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
+ dims[0], dims[1], dims[2], dims[3]
+ )
+ );
+ } else {
+ std::vector<int> dims(start, end);
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnSetFilterNdDescriptor(
+ descriptor,
+ detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
+ dims.size(), dims.data()
+ )
+ );
+ }
+ } catch (...) {
+ /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
+ CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
+ throw;
+ }
+ }
+
+ cudnnFilterDescriptor_t descriptor;
+ };
+
+ /** describes a convolution operation
+ *
+ * @tparam T type of element participating in convolution
+ */
+ template <class T>
+ class ConvolutionDescriptor {
+ public:
+ ConvolutionDescriptor() noexcept : descriptor{ nullptr } { }
+ ConvolutionDescriptor(const ConvolutionDescriptor&) = delete;
+ ConvolutionDescriptor(ConvolutionDescriptor&& other) noexcept
+ : descriptor{ other.descriptor } {
+ other.descriptor = nullptr;
+ }
+
+ /** constructs a convolution descriptor
+ *
+ * Pre-conditions:
+ * - \p zero_padding, \p stride and \p dilation must have the same size
+ *
+ * The length of the containers is interpreted as the order of the convolution.
+ *
+ * Exception Guarantee: Strong
+ */
+ template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
+ ConvolutionDescriptor(
+ const SequenceContainer& zero_padding,
+ const SequenceContainer& stride,
+ const SequenceContainer& dilation,
+ std::size_t group_count)
+ {
+ constructor(zero_padding, stride, dilation, group_count);
+ }
+
+ ~ConvolutionDescriptor() noexcept {
+ if (descriptor != nullptr) {
+ /* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
+ CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
+ }
+ }
+
+ ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete;
+ ConvolutionDescriptor& operator=(ConvolutionDescriptor&& other) noexcept {
+ descriptor = other.descriptor;
+ other.descriptor = nullptr;
+ return *this;
+ };
+
+ cudnnConvolutionDescriptor_t get() const noexcept { return descriptor; }
+
+ private:
+ template <class SequenceContainer>
+ void constructor(
+ const SequenceContainer& zero_padding,
+ const SequenceContainer& stride,
+ const SequenceContainer& dilation,
+ std::size_t group_count)
+ {
+ CV_Assert(zero_padding.size() == stride.size());
+ CV_Assert(zero_padding.size() == dilation.size());
+
+ CUDA4DNN_CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&descriptor));
+ try {
+ const auto rank = zero_padding.size();
+ if (rank == 2) {
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnSetConvolution2dDescriptor(
+ descriptor,
+ zero_padding[0], zero_padding[1],
+ stride[0], stride[1],
+ dilation[0], dilation[1],
+ CUDNN_CROSS_CORRELATION,
+ detail::get_data_type<T>()
+ )
+ );
+ } else {
+ std::vector<int> ipadding(std::begin(zero_padding), std::end(zero_padding));
+ std::vector<int> istride(std::begin(stride), std::end(stride));
+ std::vector<int> idilation(std::begin(dilation), std::end(dilation));
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnSetConvolutionNdDescriptor(
+ descriptor,
+ rank, ipadding.data(), istride.data(), idilation.data(),
+ CUDNN_CROSS_CORRELATION,
+ detail::get_data_type<T>()
+ )
+ );
+ }
+ CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count));
+ } catch (...) {
+                /* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
+ CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
+ throw;
+ }
+ }
+
+ cudnnConvolutionDescriptor_t descriptor;
+ };
+
+ /** wrapper around a convolution algorithm
+ *
+ * @tparam T type of elements being convolved
+ */
+ template <class T>
+ class ConvolutionAlgorithm {
+ public:
+ ConvolutionAlgorithm() noexcept : workspace_size{ 0 } { }
+ ConvolutionAlgorithm(ConvolutionAlgorithm&) = default;
+ ConvolutionAlgorithm(ConvolutionAlgorithm&&) = default;
+
+ /** selects a good algorithm for convolution for given configuration
+ *
+ * Exception Guarantee: Strong
+ */
+ ConvolutionAlgorithm(
+ const Handle& handle,
+ const ConvolutionDescriptor<T>& conv,
+ const FilterDescriptor<T>& filter,
+ const TensorDescriptor<T>& input,
+ const TensorDescriptor<T>& output)
+ {
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnGetConvolutionForwardAlgorithm(
+ handle.get(),
+ input.get(), filter.get(), conv.get(), output.get(),
+ CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+ 0, /* no memory limit */
+ &algo
+ )
+ );
+
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnGetConvolutionForwardWorkspaceSize(
+ handle.get(),
+ input.get(), filter.get(), conv.get(), output.get(),
+ algo, &workspace_size
+ )
+ );
+ }
+
+ ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;
+ ConvolutionAlgorithm& operator=(ConvolutionAlgorithm&& other) = default;
+
+ cudnnConvolutionFwdAlgo_t get() const noexcept { return algo; }
+
+ /** number of bytes of workspace memory required by the algorithm */
+ std::size_t get_workspace_size() const noexcept { return workspace_size; }
+
+ private:
+ cudnnConvolutionFwdAlgo_t algo;
+ std::size_t workspace_size;
+ };
+
+ /** gives the shape of the output tensor of convolution
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void getConvolutionForwardOutputDim(
+ const ConvolutionDescriptor<T>& convDesc,
+ const FilterDescriptor<T>& filterDesc,
+ const TensorDescriptor<T>& inputDesc,
+ std::vector<int>& output)
+ {
+ output.clear();
+ output.resize(CUDNN_DIM_MAX); /* we use `output` to hold temporaries */
+
+ std::vector<int> temp(CUDNN_DIM_MAX);
+ cudnnDataType_t tempDataType;
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnGetTensorNdDescriptor(
+ inputDesc.get(),
+ CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */
+ &tempDataType,
+ output.data(),
+ temp.data(),
+ temp.data()
+ )
+ );
+
+ const auto rank = output[0];
+ output.resize(rank);
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnGetConvolutionNdForwardOutputDim(
+ convDesc.get(), inputDesc.get(), filterDesc.get(), rank, output.data()
+ )
+ );
+ }
+
+ /** @brief performs convolution
+ *
+ * dstValue = alpha * result + beta * priorDstValue
+ *
+ * @tparam T convolution element type (must be `half` or `float`)
+ *
+ * @param handle valid cuDNN Handle
+ * @param convDesc convolution description
+ * @param convAlgo algorithm to use for convolution
+ * @param workspace workspace memory which meets the requirements of \p convAlgo
+ * @param filterDesc filter descriptor
+ * @param[in] filterPtr pointer to device memory containing the filters
+ * @param inputDesc tensor descriptor describing the input
+ * @param[in] inputPtr pointer to input tensor in device memory
+ * @param alpha result scale factor
+ * @param beta previous value scale factor
+ * @param outputDesc tensor descriptor describing the output
+ * @param[out] outputPtr pointer to output tensor in device memory
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void convolve(
+ const Handle& handle,
+ const ConvolutionDescriptor<T>& convDesc,
+ const ConvolutionAlgorithm<T>& convAlgo,
+ WorkspaceInstance workspace,
+ const FilterDescriptor<T>& filterDesc,
+ DevicePtr<const T> filterPtr,
+ const TensorDescriptor<T>& inputDesc,
+ DevicePtr<const T> inputPtr,
+ T alpha, T beta,
+ const TensorDescriptor<T>& outputDesc,
+ DevicePtr<T> outputPtr)
+ {
+ CV_Assert(handle);
+
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnConvolutionForward(
+ handle.get(),
+ &alpha, inputDesc.get(), inputPtr.get(),
+ filterDesc.get(), filterPtr.get(),
+ convDesc.get(), convAlgo.get(),
+ static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
+ &beta, outputDesc.get(), outputPtr.get()
+ )
+ );
+ }
+
+ template <> inline
+ void convolve(
+ const Handle& handle,
+ const ConvolutionDescriptor<half>& convDesc,
+ const ConvolutionAlgorithm<half>& convAlgo,
+ WorkspaceInstance workspace,
+ const FilterDescriptor<half>& filterDesc,
+ DevicePtr<const half> filterPtr,
+ const TensorDescriptor<half>& inputDesc,
+ DevicePtr<const half> inputPtr,
+ half alpha, half beta,
+ const TensorDescriptor<half>& outputDesc,
+ DevicePtr<half> outputPtr)
+ {
+ CV_Assert(handle);
+
+        /* we specialize for fp16 as the scaling factors must be provided as `float` */
+ float alpha_ = alpha, beta_ = beta;
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnConvolutionForward(
+ handle.get(),
+ &alpha_, inputDesc.get(), inputPtr.get(),
+ filterDesc.get(), filterPtr.get(),
+ convDesc.get(), convAlgo.get(),
+ static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
+ &beta_, outputDesc.get(), outputPtr.get()
+ )
+ );
+ }
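+
+    /* usage sketch (names assumed; a WorkspaceInstance `ws` of at least algo.get_workspace_size()
+     * bytes is presumed available): a 3x3 convolution with 64 filters on a 1x3x224x224 input:
+     *
+     *     TensorDescriptor<float> inputDesc(1, 3, 224, 224);
+     *     FilterDescriptor<float> filterDesc(64, 3, 3, 3);
+     *
+     *     std::vector<std::size_t> pad{1, 1}, stride{1, 1}, dilation{1, 1};
+     *     ConvolutionDescriptor<float> convDesc(pad, stride, dilation, 1);
+     *
+     *     std::vector<int> outShape;
+     *     getConvolutionForwardOutputDim(convDesc, filterDesc, inputDesc, outShape);
+     *     TensorDescriptor<float> outputDesc(outShape);
+     *
+     *     ConvolutionAlgorithm<float> algo(handle, convDesc, filterDesc, inputDesc, outputDesc);
+     *     convolve<float>(handle, convDesc, algo, ws, filterDesc, filterPtr,
+     *                     inputDesc, inputPtr, 1.0f, 0.0f, outputDesc, outputPtr);
+     */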
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CUDNN_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CUDNN_HPP
+
+#include "../fp16.hpp"
+#include "../pointer.hpp"
+
+#include <cudnn.h>
+
+#include <cstddef>
+#include <array>
+#include <algorithm>
+#include <functional>
+#include <numeric>
+#include <vector>
+#include <type_traits>
+#include <iterator>
+
+#define CUDA4DNN_CHECK_CUDNN(call) \
+ ::cv::dnn::cuda4dnn::csl::cudnn::detail::check((call), CV_Func, __FILE__, __LINE__)
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+ /** @brief exception class for errors thrown by the cuDNN API */
+ class cuDNNException : public CUDAException {
+ public:
+ using CUDAException::CUDAException;
+ };
+
+ namespace detail {
+ inline void check(cudnnStatus_t status, const char* func, const char* file, int line) {
+ if (status != CUDNN_STATUS_SUCCESS)
+ throw cuDNNException(Error::GpuApiCallError, cudnnGetErrorString(status), func, file, line);
+ }
+
+ /** get_data_type<T> returns the equivalent cudnn enumeration constant for type T */
+ template <class> auto get_data_type()->decltype(CUDNN_DATA_FLOAT);
+ template <> inline auto get_data_type<half>()->decltype(CUDNN_DATA_HALF) { return CUDNN_DATA_HALF; }
+ template <> inline auto get_data_type<float>()->decltype(CUDNN_DATA_FLOAT) { return CUDNN_DATA_FLOAT; }
+ }
+
+ /** @brief noncopyable cuDNN smart handle
+ *
+ * UniqueHandle is a smart non-sharable wrapper for cuDNN handle which ensures that the handle
+ * is destroyed after use.
+ */
+ class UniqueHandle {
+ public:
+ /** creates a cuDNN handle which executes in the default stream
+ *
+ * Exception Guarantee: Basic
+ */
+ UniqueHandle() { CUDA4DNN_CHECK_CUDNN(cudnnCreate(&handle)); }
+
+ UniqueHandle(UniqueHandle&) = delete;
+ UniqueHandle(UniqueHandle&& other) noexcept
+ : stream(std::move(other.stream)), handle{ other.handle } {
+ other.handle = nullptr;
+ }
+
+ /** creates a cuDNN handle and associates it with the stream specified
+ *
+ * Exception Guarantee: Basic
+ */
+ UniqueHandle(Stream strm) : stream(std::move(strm)) {
+ CUDA4DNN_CHECK_CUDNN(cudnnCreate(&handle));
+ try {
+ CUDA4DNN_CHECK_CUDNN(cudnnSetStream(handle, stream.get()));
+ } catch (...) {
+ /* cudnnDestroy won't throw if a valid handle is passed */
+ CUDA4DNN_CHECK_CUDNN(cudnnDestroy(handle));
+ throw;
+ }
+ }
+
+ ~UniqueHandle() noexcept {
+ if (handle != nullptr) {
+ /* cudnnDestroy won't throw if a valid handle is passed */
+ CUDA4DNN_CHECK_CUDNN(cudnnDestroy(handle));
+ }
+ }
+
+ UniqueHandle& operator=(const UniqueHandle&) = delete;
+ UniqueHandle& operator=(UniqueHandle&& other) noexcept {
+ stream = std::move(other.stream);
+ handle = other.handle;
+ other.handle = nullptr;
+ return *this;
+ }
+
+ /** returns the raw cuDNN handle */
+ cudnnHandle_t get() const noexcept { return handle; }
+
+ private:
+ Stream stream;
+ cudnnHandle_t handle;
+ };
+
+ /** @brief sharable cuDNN smart handle
+ *
+ * Handle is a smart sharable wrapper for cuDNN handle which ensures that the handle
+ * is destroyed after all references to the handle are destroyed.
+ *
+ * @note Moving a Handle object to another invalidates the former
+ */
+ class Handle {
+ public:
+ /** creates a cuDNN handle which executes in the default stream
+ *
+ * Exception Guarantee: Basic
+ */
+ Handle() : handle(std::make_shared<UniqueHandle>()) { }
+
+ Handle(const Handle&) = default;
+ Handle(Handle&&) = default;
+
+ /** creates a cuDNN handle and associates it with the stream specified
+ *
+ * Exception Guarantee: Basic
+ */
+ Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { }
+
+ Handle& operator=(const Handle&) = default;
+ Handle& operator=(Handle&&) = default;
+
+ /** returns true if the handle is valid */
+ explicit operator bool() const noexcept { return static_cast<bool>(handle); }
+
+ cudnnHandle_t get() const noexcept {
+ CV_Assert(handle);
+ return handle->get();
+ }
+
+ private:
+ std::shared_ptr<UniqueHandle> handle;
+ };
+
+ /** describe a tensor
+ *
+ * @tparam T type of elements in the tensor
+ */
+ template <class T>
+ class TensorDescriptor {
+ public:
+ TensorDescriptor() noexcept : descriptor{ nullptr } { }
+ TensorDescriptor(const TensorDescriptor&) = delete;
+ TensorDescriptor(TensorDescriptor&& other) noexcept
+ : descriptor{ other.descriptor } {
+ other.descriptor = nullptr;
+ }
+
+ /** constructs a tensor descriptor from the axis lengths provided in \p shape
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
+ TensorDescriptor(const SequenceContainer& shape) {
+ constructor(shape.begin(), shape.end());
+ }
+
+ /** constructs a tensor descriptor from the axis lengths provided in [begin, end)
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
+ TensorDescriptor(ForwardItr begin, ForwardItr end) {
+ constructor(begin, end);
+ }
+
+ /** constructs a tensor descriptor from the axis lengths provided as arguments
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class ...Sizes>
+ TensorDescriptor(Sizes ...sizes) {
+ static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank");
+ std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... };
+ constructor(std::begin(dims), std::end(dims));
+ }
+
+ ~TensorDescriptor() noexcept {
+ if (descriptor != nullptr) {
+ /* cudnnDestroyTensorDescriptor will not fail */
+ CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor));
+ }
+ }
+
+ TensorDescriptor& operator=(const TensorDescriptor&) = delete;
+ TensorDescriptor& operator=(TensorDescriptor&& other) noexcept {
+ descriptor = other.descriptor;
+ other.descriptor = nullptr;
+ return *this;
+ };
+
+ cudnnTensorDescriptor_t get() const noexcept { return descriptor; }
+
+ private:
+ template <class ForwardItr>
+ void constructor(ForwardItr start, ForwardItr end) {
+ CV_Assert(start != end);
+ CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX);
+
+ CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorDescriptor(&descriptor));
+ try {
+ /* cuDNN documentation recommends using the 4d tensor API whenever possible
+                 * hence, we create a 4d tensor descriptor for tensors of smaller rank
+ */
+ const auto rank = std::distance(start, end);
+ if (rank <= 4) {
+ std::array<int, 4> dims;
+ std::fill(std::begin(dims), std::end(dims), 1);
+
+ /* suppose we have a 3d tensor, the first axis is the batch axis and
+ * the second axis is the channel axis (generally)
+ *
+ * cuDNN frequently assumes that the first axis is the batch axis and the
+ * second axis is the channel axis; hence, we copy the shape of a lower rank
+                     * tensor to the beginning of `dims`
+ */
+ std::copy(start, end, std::begin(dims));
+
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnSetTensor4dDescriptor(descriptor,
+ CUDNN_TENSOR_NCHW, detail::get_data_type<T>(),
+ dims[0], dims[1], dims[2], dims[3]
+ )
+ );
+ } else {
+ std::vector<int> stride(rank);
+ stride.back() = 1;
+ /* WHAT WE HAVE NOW:
+ * stride[-1] = 1
+ * stride[-2] = garbage
+ * stride[-3] = garbage
+ * stride[-4] = garbage
+ * ...
+ */
+
+ std::copy(start + 1, end, stride.begin());
+ /* WHAT WE HAVE NOW:
+ * stride[-1] = 1
+ * stride[-2] = dim[-1]
+ * stride[-3] = dim[-2]
+ * stride[-4] = dim[-3]
+ * ...
+ */
+
+ std::partial_sum(stride.rbegin(), stride.rend(), stride.rbegin(), std::multiplies<int>());
+ /* WHAT WE HAVE NOW:
+ * stride[-1] = 1
+ * stride[-2] = stride[-1] * dim[-1]
+ * stride[-3] = stride[-2] * dim[-2]
+ * stride[-4] = stride[-3] * dim[-3]
+ * ...
+ */
+
+ std::vector<int> dims(start, end);
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnSetTensorNdDescriptor(descriptor,
+ detail::get_data_type<T>(), rank,
+ dims.data(), stride.data()
+ )
+ );
+ }
+ } catch (...) {
+ /* cudnnDestroyTensorDescriptor will not fail */
+ CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor));
+ throw;
+ }
+ }
+
+ cudnnTensorDescriptor_t descriptor;
+ };
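+
+    /* e.g. TensorDescriptor<float> desc(1, 3, 224, 224) describes a single-image NCHW batch;
+     * a lower-rank descriptor such as TensorDescriptor<float>(n, c) is padded internally to a
+     * 4d descriptor of shape (n, c, 1, 1)
+     */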
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP
+
+#include "cudnn.hpp"
+
+#include "../pointer.hpp"
+#include "../workspace.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cudnn.h>
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+ class LRNDescriptor {
+ public:
+ enum class LRNType {
+ ACROSS_CHANNELS,
+ WITHIN_CHANNEL
+ };
+
+ LRNDescriptor() noexcept : descriptor{ nullptr } { }
+ LRNDescriptor(const LRNDescriptor&) = delete;
+ LRNDescriptor(LRNDescriptor&& other) noexcept
+ : descriptor{ other.descriptor }, type{ other.type } {
+ other.descriptor = nullptr;
+ }
+
+ /** sets up a LRN descriptor
+ *
+ * @param local_size size of the normalization window
+ * @param alpha variance scaling parameter
+ * @param beta power parameter
+ * @param k bias parameter
+ *
+ * @note \p alpha is divided by the window width in across channels mode
+ * @note \p alpha is divided by the (window width)^spatialDimensions in within channel mode
+ *
+         * @note the \p alpha, \p beta and \p k will be cast to the tensor data type during the operation
+ *
+ * Exception Guarantee: Basic
+ */
+ LRNDescriptor(std::size_t local_size, double alpha, double beta, double k, LRNType type_) {
+ constructor(local_size, alpha, beta, k, type_);
+ }
+
+ ~LRNDescriptor() noexcept {
+ if (descriptor != nullptr) {
+ /* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */
+ CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor));
+ }
+ }
+
+ LRNDescriptor& operator=(const LRNDescriptor&) = delete;
+ LRNDescriptor& operator=(LRNDescriptor&& other) noexcept {
+ descriptor = other.descriptor;
+ type = other.type;
+ other.descriptor = nullptr;
+ return *this;
+ };
+
+ cudnnLRNDescriptor_t get() const noexcept { return descriptor; }
+ LRNType getType() const noexcept { return type; }
+
+ private:
+ void constructor(std::size_t local_size, double alpha, double beta, double k, LRNType type_) {
+ CV_Assert(CUDNN_LRN_MIN_N <= local_size && local_size <= CUDNN_LRN_MAX_N);
+
+ type = type_;
+
+ CUDA4DNN_CHECK_CUDNN(cudnnCreateLRNDescriptor(&descriptor));
+ try {
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnSetLRNDescriptor(
+ descriptor,
+ local_size,
+ alpha,
+ beta,
+ k
+ )
+ );
+ } catch (...) {
+ /* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */
+ CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor));
+ throw;
+ }
+ }
+
+ cudnnLRNDescriptor_t descriptor;
+ LRNType type;
+ };
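+
+    /* usage sketch: an AlexNet-style LRN with a window of 5 (parameter values for illustration):
+     *
+     *     LRNDescriptor lrn(5, 0.0001, 0.75, 1.0, LRNDescriptor::LRNType::ACROSS_CHANNELS);
+     */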
+
+ /** @brief performs local response normalization
+ *
+ * dstValue = alpha * result + beta * priorDstValue
+ *
+ * @tparam T element type (must be `half` or `float`)
+ *
+ * @param handle valid cuDNN Handle
+ * @param lrnDesc LRN description
+ * @param inputDesc tensor descriptor describing the input
+ * @param[in] inputPtr pointer to input tensor in device memory
+ * @param alpha result scale factor
+ * @param beta previous value scale factor
+ * @param outputDesc tensor descriptor describing the output
+ * @param[out] outputPtr pointer to output tensor in device memory
+     * @param workspace workspace memory for the operation (used only in within-channel mode)
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void LRNForward(
+ const Handle& handle,
+ const LRNDescriptor& lrnDesc,
+ const TensorDescriptor<T>& inputDesc,
+ DevicePtr<const T> inputPtr,
+ T alpha, T beta,
+ const TensorDescriptor<T>& outputDesc,
+ DevicePtr<T> outputPtr,
+ WorkspaceInstance workspace)
+ {
+ CV_Assert(handle);
+
+ if (lrnDesc.getType() == LRNDescriptor::LRNType::ACROSS_CHANNELS) {
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnLRNCrossChannelForward(
+ handle.get(),
+ lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1,
+ &alpha, inputDesc.get(), inputPtr.get(),
+ &beta, outputDesc.get(), outputPtr.get()
+ )
+ );
+ } else if (lrnDesc.getType() == LRNDescriptor::LRNType::WITHIN_CHANNEL) {
+ std::size_t size;
+ CUDA4DNN_CHECK_CUDNN(cudnnGetTensorSizeInBytes(inputDesc.get(), &size));
+
+ DevicePtr<void> temp1 = workspace.get_span<half>(size).data();
+ DevicePtr<void> temp2 = workspace.get_span<half>(size).data();
+
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnDivisiveNormalizationForward(
+ handle.get(),
+ lrnDesc.get(), CUDNN_DIVNORM_PRECOMPUTED_MEANS,
+ &alpha, inputDesc.get(), inputPtr.get(),
+ NULL,
+ static_cast<void*>(temp1), static_cast<void*>(temp2),
+ &beta, outputDesc.get(), outputPtr.get()
+ )
+ );
+ }
+ }
+
+ template <> inline
+ void LRNForward(
+ const Handle& handle,
+ const LRNDescriptor& lrnDesc,
+ const TensorDescriptor<half>& inputDesc,
+ DevicePtr<const half> inputPtr,
+ half alpha, half beta,
+ const TensorDescriptor<half>& outputDesc,
+ DevicePtr<half> outputPtr,
+ WorkspaceInstance workspace)
+ {
+ CV_Assert(handle);
+
+        /* we specialize for fp16 as the scaling factors must be provided as `float` */
+ float alpha_ = alpha, beta_ = beta;
+ if (lrnDesc.getType() == LRNDescriptor::LRNType::ACROSS_CHANNELS) {
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnLRNCrossChannelForward(
+ handle.get(),
+ lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1,
+ &alpha_, inputDesc.get(), inputPtr.get(),
+ &beta_, outputDesc.get(), outputPtr.get()
+ )
+ );
+ } else if (lrnDesc.getType() == LRNDescriptor::LRNType::WITHIN_CHANNEL) {
+ std::size_t size;
+ CUDA4DNN_CHECK_CUDNN(cudnnGetTensorSizeInBytes(inputDesc.get(), &size));
+
+ DevicePtr<void> temp1 = workspace.get_span<half>(size).data();
+ DevicePtr<void> temp2 = workspace.get_span<half>(size).data();
+
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnDivisiveNormalizationForward(
+ handle.get(),
+ lrnDesc.get(), CUDNN_DIVNORM_PRECOMPUTED_MEANS,
+ &alpha_, inputDesc.get(), inputPtr.get(),
+ NULL,
+ static_cast<void*>(temp1), static_cast<void*>(temp2),
+ &beta_, outputDesc.get(), outputPtr.get()
+ )
+ );
+ }
+ }
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP
+
+#include "cudnn.hpp"
+
+#include "../pointer.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cudnn.h>
+
+#include <cstddef>
+#include <array>
+#include <algorithm>
+#include <vector>
+#include <type_traits>
+#include <iterator>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+ class PoolingDescriptor {
+ public:
+ enum class PoolingType {
+ MAX,
+ MAX_DETERMINISTIC,
+ AVERAGE_EXCLUDE_PADDING,
+ AVERAGE_INCLUDE_PADDING
+ };
+
+ PoolingDescriptor() noexcept : descriptor{ nullptr } { }
+ PoolingDescriptor(const PoolingDescriptor&) = delete;
+ PoolingDescriptor(PoolingDescriptor&& other) noexcept
+ : descriptor{ other.descriptor } {
+ other.descriptor = nullptr;
+ }
+
+ /** constructs a pooling descriptor
+ *
+ * Pre-conditions:
+ * - \p window_size, \p padding and \p stride must have the same size
+ *
+ * The length of the containers is interpreted as the order of the pooling operation.
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
+ PoolingDescriptor(
+ const SequenceContainer& window_size,
+ const SequenceContainer& padding,
+ const SequenceContainer& stride,
+ PoolingType type)
+ {
+ constructor(window_size, padding, stride, type);
+ }
+
+ ~PoolingDescriptor() noexcept {
+ if (descriptor != nullptr) {
+ /* cudnnDestroyPoolingDescriptor will not fail for a valid descriptor */
+ CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor));
+ }
+ }
+
+ PoolingDescriptor& operator=(const PoolingDescriptor&) = delete;
+ PoolingDescriptor& operator=(PoolingDescriptor&& other) noexcept {
+ descriptor = other.descriptor;
+ other.descriptor = nullptr;
+ return *this;
+ };
+
+ cudnnPoolingDescriptor_t get() const noexcept { return descriptor; }
+
+ private:
+ template <class SequenceContainer>
+ void constructor(
+ const SequenceContainer& window_size,
+ const SequenceContainer& padding,
+ const SequenceContainer& stride,
+ PoolingType type)
+ {
+ CV_Assert(window_size.size() == padding.size());
+ CV_Assert(window_size.size() == stride.size());
+
+ auto get_pooling_type = [] (PoolingType type) {
+ switch (type) {
+ case PoolingType::MAX:
+ return CUDNN_POOLING_MAX;
+ case PoolingType::MAX_DETERMINISTIC:
+ return CUDNN_POOLING_MAX_DETERMINISTIC;
+ case PoolingType::AVERAGE_EXCLUDE_PADDING:
+ return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+ case PoolingType::AVERAGE_INCLUDE_PADDING:
+ return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+ }
+ CV_Error(Error::StsBadArg, "unknown pooling type");
+ };
+
+ CUDA4DNN_CHECK_CUDNN(cudnnCreatePoolingDescriptor(&descriptor));
+ try {
+ const auto rank = window_size.size();
+ if (rank == 2) {
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnSetPooling2dDescriptor(
+ descriptor,
+ get_pooling_type(type), CUDNN_PROPAGATE_NAN,
+ window_size[0], window_size[1],
+ padding[0], padding[1],
+ stride[0], stride[1]
+ )
+ );
+ } else {
+ std::vector<int> iwindow_size(std::begin(window_size), std::end(window_size));
+ std::vector<int> ipadding(std::begin(padding), std::end(padding));
+ std::vector<int> istride(std::begin(stride), std::end(stride));
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnSetPoolingNdDescriptor(
+ descriptor,
+ get_pooling_type(type), CUDNN_PROPAGATE_NAN,
+ rank, iwindow_size.data(), ipadding.data(), istride.data()
+ )
+ );
+ }
+ } catch (...) {
+ /* cudnnDestroyPoolingDescriptor will not fail for a valid descriptor */
+ CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor));
+ throw;
+ }
+ }
+
+ cudnnPoolingDescriptor_t descriptor;
+ };
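+
+    /* usage sketch: 2x2 max pooling with stride 2 and no padding (values for illustration):
+     *
+     *     std::vector<int> window{2, 2}, padding{0, 0}, stride{2, 2};
+     *     PoolingDescriptor pool(window, padding, stride, PoolingDescriptor::PoolingType::MAX);
+     */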
+
+ /** gives the shape of the output tensor after pooling
+ *
+     * @note it's not required to enforce this shape in the output tensor; slightly different shapes will work
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T> inline
+ void getPoolingForwardOutputDim(
+ const PoolingDescriptor& poolingDesc,
+ const TensorDescriptor<T>& inputDesc,
+ std::vector<int>& output_dim)
+ {
+ output_dim.clear();
+ output_dim.resize(CUDNN_DIM_MAX); /* we use `output_dim` to hold temporaries */
+
+ std::vector<int> temp(CUDNN_DIM_MAX);
+ cudnnDataType_t tempDataType;
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnGetTensorNdDescriptor(
+ inputDesc.get(),
+ CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */
+ &tempDataType,
+ output_dim.data(),
+ temp.data(),
+ temp.data()
+ )
+ );
+
+ const auto rank = output_dim[0];
+ output_dim.resize(rank);
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnGetPoolingNdForwardOutputDim(poolingDesc.get(), inputDesc.get(), rank, output_dim.data())
+ );
+ }
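+
+    /* A usage sketch (illustrative only): query the output shape before allocating the
+     * output tensor. `pooling_desc` and `input_desc` are assumed to be valid descriptors
+     * built elsewhere.
+     *
+     *   std::vector<int> output_dim;
+     *   getPoolingForwardOutputDim(pooling_desc, input_desc, output_dim);
+     *   // for 2D pooling, output_dim now holds [N, C, H_out, W_out]
+     */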
+
+ /** @brief performs pooling operation
+ *
+ * dstValue = alpha * result + beta * priorDstValue
+ *
+ * @tparam T pooling element type (must be `half` or `float`)
+ *
+ * @param handle valid cuDNN Handle
+ * @param poolingDesc pooling description
+ * @param inputDesc tensor descriptor describing the input
+ * @param[in] inputPtr pointer to input tensor in device memory
+ * @param alpha result scale factor
+ * @param beta previous value scale factor
+ * @param outputDesc tensor descriptor describing the output
+ * @param[out] outputPtr pointer to output tensor in device memory
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void pool(
+ const Handle& handle,
+ const PoolingDescriptor& poolingDesc,
+ const TensorDescriptor<T>& inputDesc,
+ const DevicePtr<const T> inputPtr,
+ T alpha, T beta,
+ const TensorDescriptor<T>& outputDesc,
+ DevicePtr<T> outputPtr)
+ {
+ CV_Assert(handle);
+
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnPoolingForward(
+ handle.get(),
+ poolingDesc.get(),
+ &alpha, inputDesc.get(), inputPtr.get(),
+ &beta, outputDesc.get(), outputPtr.get()
+ )
+ );
+ }
+
+ template <> inline
+ void pool(
+ const Handle& handle,
+ const PoolingDescriptor& poolingDesc,
+ const TensorDescriptor<half>& inputDesc,
+ const DevicePtr<const half> inputPtr,
+ half alpha, half beta,
+ const TensorDescriptor<half>& outputDesc,
+ DevicePtr<half> outputPtr)
+ {
+ CV_Assert(handle);
+
+        /* we specialize for fp16 as the scaling factors must be provided as `float` */
+ float alpha_ = alpha, beta_ = beta;
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnPoolingForward(
+ handle.get(),
+ poolingDesc.get(),
+ &alpha_, inputDesc.get(), inputPtr.get(),
+ &beta_, outputDesc.get(), outputPtr.get()
+ )
+ );
+ }
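+
+    /* Putting the pieces together: a minimal sketch of a 2D max-pooling forward pass.
+     * The cuDNN handle, tensor descriptors and device pointers are assumed to exist;
+     * only PoolingDescriptor and pool() are defined in this header.
+     *
+     *   std::vector<int> window{ 2, 2 }, padding{ 0, 0 }, stride{ 2, 2 };
+     *   PoolingDescriptor pooling_desc(window, padding, stride, PoolingType::MAX);
+     *   pool<float>(handle, pooling_desc, input_desc, input_ptr, 1.0f, 0.0f, output_desc, output_ptr);
+     */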
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP
+
+#include "cudnn.hpp"
+
+#include "../pointer.hpp"
+
+#include <cudnn.h>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+ /** @brief computes softmax (or log softmax)
+ *
+ * @tparam T element type (must be `half` or `float`)
+ *
+ * @param handle valid cuDNN handle
+     * @param outputDesc tensor descriptor describing the output
+     * @param[out] output pointer to output tensor in device memory
+     * @param inputDesc tensor descriptor describing the input
+     * @param[in] input pointer to input tensor in device memory
+ * @param log apply log on probabilities
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void softmax(const cudnn::Handle& handle,
+ const TensorDescriptor<T>& outputDesc, DevicePtr<T> output,
+ const TensorDescriptor<T>& inputDesc, DevicePtr<const T> input,
+ bool log)
+ {
+ T alpha = 1.0, beta = 0.0;
+ cudnnSoftmaxAlgorithm_t algo = log ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnSoftmaxForward(
+ handle.get(),
+ algo, CUDNN_SOFTMAX_MODE_CHANNEL,
+ &alpha, inputDesc.get(), input.get(),
+ &beta, outputDesc.get(), output.get()
+ )
+ );
+ }
+
+ template <> inline
+ void softmax(const cudnn::Handle& handle,
+ const TensorDescriptor<half>& outputDesc, DevicePtr<half> output,
+ const TensorDescriptor<half>& inputDesc, DevicePtr<const half> input,
+ bool log)
+ {
+        /* we specialize for fp16 as the scaling factors must be provided as `float` */
+ float alpha = 1.0, beta = 0.0;
+ cudnnSoftmaxAlgorithm_t algo = log ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnSoftmaxForward(
+ handle.get(),
+ algo, CUDNN_SOFTMAX_MODE_CHANNEL,
+ &alpha, inputDesc.get(), input.get(),
+ &beta, outputDesc.get(), output.get()
+ )
+ );
+ }
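+
+    /* Usage sketch (descriptors and device pointers assumed valid):
+     *
+     *   softmax<float>(handle, output_desc, output, input_desc, input, false); // probabilities
+     *   softmax<float>(handle, output_desc, output, input_desc, input, true);  // log-probabilities
+     */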
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP
+
+#include "../pointer.hpp"
+
+#include "cudnn.hpp"
+
+#include <cudnn.h>
+#include <vector>
+#include <type_traits>
+#include <iterator>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+ /** describes a tensor transform operation
+ *
+ * Supported transformations:
+ * - add or remove asymmetric padding
+ */
+ class TensorTransformDescriptor {
+ public:
+ TensorTransformDescriptor() noexcept : descriptor{ nullptr } { }
+ TensorTransformDescriptor(const TensorTransformDescriptor&) = delete;
+ TensorTransformDescriptor(TensorTransformDescriptor&& other) noexcept
+ : descriptor{ other.descriptor } {
+ other.descriptor = nullptr;
+ }
+
+        /** constructs a tensor transform descriptor
+ *
+ * Pre-conditions:
+ * - \p padding_left and \p padding_right must have the same size
+ *
+ * The length of the containers is interpreted as the rank of the tensors which will be given.
+ *
+ * @note \p padding_left and \p padding_right may have negative values to remove padding
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
+ TensorTransformDescriptor(
+ const SequenceContainer& padding_left,
+ const SequenceContainer& padding_right)
+ {
+ constructor(padding_left, padding_right);
+ }
+
+ ~TensorTransformDescriptor() noexcept {
+ if (descriptor != nullptr) {
+ /* cudnnDestroyTensorTransformDescriptor will not fail for a valid descriptor */
+ CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorTransformDescriptor(descriptor));
+ }
+ }
+
+ TensorTransformDescriptor& operator=(const TensorTransformDescriptor&) = delete;
+ TensorTransformDescriptor& operator=(TensorTransformDescriptor&& other) noexcept {
+ descriptor = other.descriptor;
+ other.descriptor = nullptr;
+ return *this;
+        }
+
+ cudnnTensorTransformDescriptor_t get() const noexcept { return descriptor; }
+
+ private:
+ template <class SequenceContainer>
+ void constructor(
+ const SequenceContainer& padding_left,
+ const SequenceContainer& padding_right
+ )
+ {
+ CV_Assert(padding_left.size() == padding_right.size());
+
+ auto ipadding_left = std::vector<int32_t>(std::begin(padding_left), std::end(padding_left));
+ auto ipadding_right = std::vector<int32_t>(std::begin(padding_right), std::end(padding_right));
+ CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorTransformDescriptor(&descriptor));
+ try {
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnSetTensorTransformDescriptor(
+ descriptor,
+ ipadding_left.size(), CUDNN_TENSOR_NCHW,
+ ipadding_left.data(), ipadding_right.data(),
+ NULL, CUDNN_TRANSFORM_FOLD
+ )
+ );
+ } catch (...) {
+ /* cudnnDestroyTensorTransformDescriptor will not fail for a valid descriptor */
+ CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorTransformDescriptor(descriptor));
+ throw;
+ }
+ }
+
+ cudnnTensorTransformDescriptor_t descriptor;
+ };
+
+ template <class T>
+ void transform(
+ const Handle& handle,
+ const TensorTransformDescriptor& transDesc,
+ const TensorDescriptor<T>& inputDesc,
+ DevicePtr<const T> inputPtr,
+ const TensorDescriptor<T>& outputDesc,
+ DevicePtr<T> outputPtr)
+ {
+ T alpha = 1.0, beta = 0.0;
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnTransformTensorEx(
+ handle.get(),
+ transDesc.get(),
+ &alpha, inputDesc.get(), inputPtr.get(),
+ &beta, outputDesc.get(), outputPtr.get()
+ )
+ );
+ }
+
+ template <> inline
+ void transform(
+ const Handle& handle,
+ const TensorTransformDescriptor& transDesc,
+ const TensorDescriptor<half>& inputDesc,
+ DevicePtr<const half> inputPtr,
+ const TensorDescriptor<half>& outputDesc,
+ DevicePtr<half> outputPtr)
+ {
+        /* we specialize for fp16 as the scaling factors must be provided as `float` */
+ float alpha = 1.0, beta = 0.0;
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnTransformTensorEx(
+ handle.get(),
+ transDesc.get(),
+ &alpha, inputDesc.get(), inputPtr.get(),
+ &beta, outputDesc.get(), outputPtr.get()
+ )
+ );
+ }
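+
+    /* A hedged sketch: removing one unit of padding from each spatial border of an
+     * NCHW tensor (negative values remove padding). Descriptors and pointers are
+     * assumed to be set up elsewhere.
+     *
+     *   std::vector<int> left{ 0, 0, -1, -1 }, right{ 0, 0, -1, -1 };
+     *   TensorTransformDescriptor trans_desc(left, right);
+     *   transform<float>(handle, trans_desc, input_desc, input_ptr, output_desc, output_ptr);
+     */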
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP
+
+#include "cudnn.hpp"
+#include "convolution.hpp"
+
+#include "../pointer.hpp"
+#include "../workspace.hpp"
+
+#include <cudnn.h>
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+ /** wrapper around a transpose convolution algorithm
+ *
+ * @tparam T type of elements being transpose-convolved
+ */
+ template <class T>
+ class TransposeConvolutionAlgorithm {
+ public:
+ TransposeConvolutionAlgorithm() noexcept : workspace_size{ 0 } { }
+        TransposeConvolutionAlgorithm(const TransposeConvolutionAlgorithm&) = default;
+ TransposeConvolutionAlgorithm(TransposeConvolutionAlgorithm&&) = default;
+
+ TransposeConvolutionAlgorithm(
+ const Handle& handle,
+ const ConvolutionDescriptor<T>& conv,
+ const FilterDescriptor<T>& filter,
+ const TensorDescriptor<T>& input,
+ const TensorDescriptor<T>& output)
+ {
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnGetConvolutionBackwardDataAlgorithm(
+ handle.get(),
+ filter.get(), input.get(), conv.get(), output.get(),
+ CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
+ 0, /* no memory limit */
+ &dalgo
+ )
+ );
+
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnGetConvolutionBackwardDataWorkspaceSize(
+ handle.get(),
+ filter.get(), input.get(), conv.get(), output.get(),
+ dalgo, &workspace_size
+ )
+ );
+ }
+
+ TransposeConvolutionAlgorithm& operator=(const TransposeConvolutionAlgorithm&) = default;
+ TransposeConvolutionAlgorithm& operator=(TransposeConvolutionAlgorithm&& other) = default;
+
+ cudnnConvolutionBwdDataAlgo_t get() const noexcept { return dalgo; }
+
+ std::size_t get_workspace_size() const noexcept { return workspace_size; }
+
+ private:
+ cudnnConvolutionBwdDataAlgo_t dalgo;
+ std::size_t workspace_size;
+ };
+
+ /** @brief performs transpose convolution
+ *
+ * dstValue = alpha * result + beta * priorDstValue
+ *
+ * @tparam T transpose convolution element type (must be `half` or `float`)
+ *
+ * @param handle valid cuDNN Handle
+ * @param convDesc convolution description
+     * @param transConvAlgo algorithm to use for the transpose convolution
+     * @param workspace workspace memory which meets the requirements of \p transConvAlgo
+ * @param filterDesc filter descriptor
+ * @param[in] filterPtr pointer to device memory containing the filters
+ * @param inputDesc tensor descriptor describing the input
+ * @param[in] inputPtr pointer to input tensor in device memory
+ * @param alpha result scale factor
+ * @param beta previous value scale factor
+ * @param outputDesc tensor descriptor describing the output
+ * @param[out] outputPtr pointer to output tensor in device memory
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void transpose_convolve(
+ const Handle& handle,
+ const ConvolutionDescriptor<T>& convDesc,
+ const TransposeConvolutionAlgorithm<T>& transConvAlgo,
+ WorkspaceInstance workspace,
+ const FilterDescriptor<T>& filterDesc,
+ DevicePtr<const T> filterPtr,
+ const TensorDescriptor<T>& inputDesc,
+ DevicePtr<const T> inputPtr,
+ T alpha, T beta,
+ const TensorDescriptor<T>& outputDesc,
+ DevicePtr<T> outputPtr)
+ {
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnConvolutionBackwardData(
+ handle.get(),
+ &alpha,
+ filterDesc.get(), filterPtr.get(),
+ inputDesc.get(), inputPtr.get(),
+ convDesc.get(), transConvAlgo.get(),
+ static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
+ &beta, outputDesc.get(), outputPtr.get()
+ )
+ );
+ }
+
+ template <> inline
+ void transpose_convolve(
+ const Handle& handle,
+ const ConvolutionDescriptor<half>& convDesc,
+ const TransposeConvolutionAlgorithm<half>& convAlgo,
+ WorkspaceInstance workspace,
+ const FilterDescriptor<half>& filterDesc,
+ DevicePtr<const half> filterPtr,
+ const TensorDescriptor<half>& inputDesc,
+ DevicePtr<const half> inputPtr,
+ half alpha, half beta,
+ const TensorDescriptor<half>& outputDesc,
+ DevicePtr<half> outputPtr)
+ {
+        /* we specialize for fp16 as the scaling factors must be provided as `float` */
+ float alpha_ = alpha, beta_ = beta;
+ CUDA4DNN_CHECK_CUDNN(
+ cudnnConvolutionBackwardData(
+ handle.get(),
+ &alpha_,
+ filterDesc.get(), filterPtr.get(),
+ inputDesc.get(), inputPtr.get(),
+ convDesc.get(), convAlgo.get(),
+ static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
+ &beta_, outputDesc.get(), outputPtr.get()
+ )
+ );
+ }
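+
+    /* Sketch of the expected call sequence; workspace allocation is the caller's
+     * responsibility and `workspace` is assumed to wrap a buffer of at least
+     * algo.get_workspace_size() bytes.
+     *
+     *   TransposeConvolutionAlgorithm<float> algo(handle, conv_desc, filter_desc, input_desc, output_desc);
+     *   // ... allocate a workspace of algo.get_workspace_size() bytes ...
+     *   transpose_convolve<float>(handle, conv_desc, algo, workspace,
+     *                             filter_desc, filter_ptr, input_desc, input_ptr,
+     *                             1.0f, 0.0f, output_desc, output_ptr);
+     */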
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP
+
+#include <opencv2/core.hpp>
+
+#include <cuda_runtime_api.h>
+
+#define CUDA4DNN_CHECK_CUDA(call) \
+ ::cv::dnn::cuda4dnn::csl::detail::check((call), CV_Func, __FILE__, __LINE__)
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+ /** @brief exception class for errors thrown by the CUDA APIs */
+ class CUDAException : public cv::Exception {
+ public:
+ using cv::Exception::Exception;
+ };
+
+ namespace detail {
+ inline void check(cudaError_t err, const char* func, const char* file, int line) {
+ if (err != cudaSuccess)
+ throw CUDAException(Error::GpuApiCallError, cudaGetErrorString(err), func, file, line);
+ }
+ }
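+
+    /* Example: any CUDA runtime call can be wrapped so that error codes become
+     * exceptions carrying the call site:
+     *
+     *   void* ptr = nullptr;
+     *   CUDA4DNN_CHECK_CUDA(cudaMalloc(&ptr, 1024)); // throws CUDAException on failure
+     */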
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP
+
+#include "error.hpp"
+#include "stream.hpp"
+
+#include <opencv2/core/utils/logger.hpp>
+
+#include <cuda_runtime_api.h>
+
+#include <sstream>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+ /** @brief sharable CUDA event
+ *
+ * Event is a smart sharable wrapper for CUDA event handle which ensures that
+ * the handle is destroyed after use.
+ *
+ * @note Moving an Event object to another invalidates the former
+ */
+ class Event {
+ public:
+ Event() noexcept : event{ nullptr } { }
+ Event(const Event&) = delete;
+ Event(Event&& other) noexcept
+ : event{ other.event } {
+ other.event = nullptr;
+ }
+
+ /** if \p create is `true`, a new event will be created; otherwise, an empty event object is created */
+ Event(bool create, bool timing_event = false) : event{nullptr} {
+ if (create) {
+ unsigned int flags = cudaEventBlockingSync | (timing_event ? 0 : cudaEventDisableTiming);
+ CUDA4DNN_CHECK_CUDA(cudaEventCreateWithFlags(&event, flags));
+ }
+ }
+
+ ~Event() {
+ try {
+ if (event != nullptr)
+ CUDA4DNN_CHECK_CUDA(cudaEventDestroy(event));
+ } catch (const CUDAException& ex) {
+ std::ostringstream os;
+ os << "Asynchronous exception caught during CUDA event destruction.\n";
+ os << ex.what();
+ os << "Exception will be ignored.\n";
+ CV_LOG_WARNING(0, os.str().c_str());
+ }
+ }
+
+ Event& operator=(const Event&) noexcept = delete;
+ Event& operator=(Event&& other) noexcept {
+ event = other.event;
+ other.event = nullptr;
+ return *this;
+ }
+
+ /** mark a point in \p stream */
+ void record(const Stream& stream) {
+ CUDA4DNN_CHECK_CUDA(cudaEventRecord(event, stream.get()));
+ }
+
+ /** blocks the caller thread until all operations before the event finish */
+ void synchronize() const { CUDA4DNN_CHECK_CUDA(cudaEventSynchronize(event)); }
+
+ /** returns true if there are operations pending before the event completes */
+ bool busy() const {
+ auto status = cudaEventQuery(event);
+ if (status == cudaErrorNotReady)
+ return true;
+ CUDA4DNN_CHECK_CUDA(status);
+ return false;
+ }
+
+ cudaEvent_t get() const noexcept { return event; }
+
+ /** returns true if the event is valid */
+ explicit operator bool() const noexcept { return event; }
+
+ private:
+ cudaEvent_t event;
+ };
+
+ /** makes a stream wait on an event */
+    inline void StreamWaitOnEvent(const Stream& stream, const Event& event) {
+ CUDA4DNN_CHECK_CUDA(cudaStreamWaitEvent(stream.get(), event.get(), 0));
+ }
+
+ /** returns the time elapsed between two events in milliseconds */
+    inline float TimeElapsedBetweenEvents(const Event& start, const Event& end) {
+ float temp;
+ CUDA4DNN_CHECK_CUDA(cudaEventElapsedTime(&temp, start.get(), end.get()));
+ return temp;
+ }
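+
+    /* Sketch: timing work queued on a stream with a pair of timing-enabled events
+     * (the stream and the enqueued work are assumed to exist):
+     *
+     *   Event start(true, true), end(true, true);
+     *   start.record(stream);
+     *   // ... enqueue kernels on `stream` ...
+     *   end.record(stream);
+     *   end.synchronize();
+     *   float ms = TimeElapsedBetweenEvents(start, end);
+     */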
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_FP16_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_FP16_HPP
+
+#include "nvcc_defs.hpp"
+
+#include <cuda_fp16.h>
+
+#include <type_traits>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+ namespace detail {
+ template <class T, class = void>
+ struct is_half_convertible : std::false_type { };
+
+ template <class T>
+ struct is_half_convertible<T, typename std::enable_if<std::is_integral<T>::value, void>::type> : std::true_type { };
+
+ template <class T>
+ struct is_half_convertible<T, typename std::enable_if<std::is_floating_point<T>::value, void>::type> : std::true_type { };
+ }
+
+ /* Note: nvcc has a broken overload resolution; it considers host overloads inside device code
+ CUDA4DNN_HOST bool operator==(half lhs, half rhs) noexcept { return static_cast<float>(lhs) == static_cast<float>(rhs); }
+ CUDA4DNN_HOST bool operator!=(half lhs, half rhs) noexcept { return static_cast<float>(lhs) != static_cast<float>(rhs); }
+ CUDA4DNN_HOST bool operator<(half lhs, half rhs) noexcept { return static_cast<float>(lhs) < static_cast<float>(rhs); }
+ CUDA4DNN_HOST bool operator>(half lhs, half rhs) noexcept { return static_cast<float>(lhs) > static_cast<float>(rhs); }
+ CUDA4DNN_HOST bool operator<=(half lhs, half rhs) noexcept { return static_cast<float>(lhs) <= static_cast<float>(rhs); }
+ CUDA4DNN_HOST bool operator>=(half lhs, half rhs) noexcept { return static_cast<float>(lhs) >= static_cast<float>(rhs); }
+ */
+
+ template <class T> CUDA4DNN_HOST
+ typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+ ::type operator==(half lhs, T rhs) noexcept { return static_cast<float>(lhs) == static_cast<float>(rhs); }
+
+ template <class T> CUDA4DNN_HOST
+ typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+ ::type operator!=(half lhs, T rhs) noexcept { return static_cast<float>(lhs) != static_cast<float>(rhs); }
+
+ template <class T> CUDA4DNN_HOST
+ typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+ ::type operator<(half lhs, T rhs) noexcept { return static_cast<float>(lhs) < static_cast<float>(rhs); }
+
+ template <class T> CUDA4DNN_HOST
+ typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+ ::type operator>(half lhs, T rhs) noexcept { return static_cast<float>(lhs) > static_cast<float>(rhs); }
+
+ template <class T> CUDA4DNN_HOST
+ typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+ ::type operator<=(half lhs, T rhs) noexcept { return static_cast<float>(lhs) <= static_cast<float>(rhs); }
+
+ template <class T> CUDA4DNN_HOST
+ typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+ ::type operator>=(half lhs, T rhs) noexcept { return static_cast<float>(lhs) >= static_cast<float>(rhs); }
+
+ template <class T> CUDA4DNN_HOST
+ typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+ ::type operator==(T lhs, half rhs) noexcept { return static_cast<float>(lhs) == static_cast<float>(rhs); }
+
+ template <class T> CUDA4DNN_HOST
+ typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+ ::type operator!=(T lhs, half rhs) noexcept { return static_cast<float>(lhs) != static_cast<float>(rhs); }
+
+ template <class T> CUDA4DNN_HOST
+ typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+ ::type operator<(T lhs, half rhs) noexcept { return static_cast<float>(lhs) < static_cast<float>(rhs); }
+
+ template <class T> CUDA4DNN_HOST
+ typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+ ::type operator>(T lhs, half rhs) noexcept { return static_cast<float>(lhs) > static_cast<float>(rhs); }
+
+ template <class T> CUDA4DNN_HOST
+ typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+ ::type operator<=(T lhs, half rhs) noexcept { return static_cast<float>(lhs) <= static_cast<float>(rhs); }
+
+ template <class T> CUDA4DNN_HOST
+ typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+ ::type operator>=(T lhs, half rhs) noexcept { return static_cast<float>(lhs) >= static_cast<float>(rhs); }
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_FP16_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP
+
+#include "error.hpp"
+#include "pointer.hpp"
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/utils/logger.hpp>
+
+#include <cuda_runtime_api.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <type_traits>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+    /** @brief smart device pointer with allocation/deallocation methods
+ *
+ * ManagedPtr is a smart shared device pointer which also handles memory allocation.
+ */
+ template <class T>
+ class ManagedPtr {
+ static_assert(!std::is_const<T>::value && !std::is_volatile<T>::value, "T cannot be cv-qualified");
+ static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");
+
+ public:
+ using element_type = T;
+
+ using pointer = DevicePtr<element_type>;
+ using const_pointer = DevicePtr<typename std::add_const<element_type>::type>;
+
+ using size_type = std::size_t;
+
+ ManagedPtr() noexcept : wrapped{ nullptr }, n{ 0 }, capacity{ 0 } { }
+ ManagedPtr(const ManagedPtr&) noexcept = default;
+ ManagedPtr(ManagedPtr&& other) noexcept
+ : wrapped{ std::move(other.wrapped) }, n{ other.n }, capacity { other.capacity }
+ {
+ other.reset();
+ }
+
+        /** allocates device memory for \p count number of elements */
+ ManagedPtr(size_type count) {
+ if (count <= 0) {
+ CV_Error(Error::StsBadArg, "number of elements is zero or negative");
+ }
+
+ void* temp = nullptr;
+ CUDA4DNN_CHECK_CUDA(cudaMalloc(&temp, count * sizeof(element_type)));
+
+ auto ptr = typename pointer::pointer(static_cast<element_type*>(temp));
+ wrapped.reset(ptr, [](element_type* ptr) {
+ if (ptr != nullptr) {
+ /* contract violation for std::shared_ptr if cudaFree throws */
+ try {
+ CUDA4DNN_CHECK_CUDA(cudaFree(ptr));
+ } catch (const CUDAException& ex) {
+ std::ostringstream os;
+ os << "Device memory deallocation failed in deleter.\n";
+ os << ex.what();
+ os << "Exception will be ignored.\n";
+ CV_LOG_WARNING(0, os.str().c_str());
+ }
+ }
+ });
+            /* std::shared_ptr<T>::reset invokes the deleter if an exception occurs; hence, we don't
+ * need to have a try-catch block to free the allocated device memory
+ */
+
+ n = capacity = count;
+ }
+
+ ManagedPtr& operator=(ManagedPtr&& other) noexcept {
+ wrapped = std::move(other.wrapped);
+ n = other.n;
+ capacity = other.capacity;
+
+ other.reset();
+ return *this;
+ }
+
+ size_type size() const noexcept { return n; }
+
+ void reset() noexcept { wrapped.reset(); n = capacity = 0; }
+
+ /**
+ * deallocates any previously allocated memory and allocates device memory
+ * for \p count number of elements
+ *
+ * @note no reallocation if the previously allocated memory has no owners and the requested memory size fits in it
+ * @note use move constructor to guarantee a deallocation of the previously allocated memory
+ *
+ * Exception Guarantee: Strong
+ */
+ void reset(size_type count) {
+ /* we need to fully own the memory to perform optimizations */
+ if (wrapped.use_count() == 1) {
+ /* avoid reallocation if the existing capacity is sufficient */
+ if (count <= capacity) {
+ n = count;
+ return;
+ }
+ }
+
+ /* no optimization performed; allocate memory */
+ ManagedPtr tmp(count);
+ swap(tmp, *this);
+ }
+
+ pointer get() const noexcept { return pointer(wrapped.get()); }
+
+        explicit operator bool() const noexcept { return static_cast<bool>(wrapped); }
+
+ friend bool operator==(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped == rhs.wrapped; }
+ friend bool operator!=(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped != rhs.wrapped; }
+
+ friend void swap(ManagedPtr& lhs, ManagedPtr& rhs) noexcept {
+ using std::swap;
+ swap(lhs.wrapped, rhs.wrapped);
+ swap(lhs.n, rhs.n);
+ swap(lhs.capacity, rhs.capacity);
+ }
+
+ private:
+ std::shared_ptr<element_type> wrapped;
+ size_type n, capacity;
+ };
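+
+    /* Sketch: allocating and reusing device memory through ManagedPtr:
+     *
+     *   ManagedPtr<float> buf(1024);      // allocates space for 1024 floats on the device
+     *   buf.reset(512);                   // no reallocation: sole owner and capacity suffices
+     *   DevicePtr<float> ptr = buf.get(); // non-owning typed device pointer
+     */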
+
+ /** copies entire memory block pointed by \p src to \p dest
+ *
+ * \param[in] src device pointer
+ * \param[out] dest host pointer
+ *
+ * Pre-conditions:
+ * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void memcpy(T *dest, const ManagedPtr<T>& src) {
+ memcpy<T>(dest, src.get(), src.size());
+ }
+
+ /** copies data from memory pointed by \p src to fully fill \p dest
+ *
+ * \param[in] src host pointer
+ * \param[out] dest device pointer
+ *
+ * Pre-conditions:
+ * - memory pointed by \p src must be at least as big as the memory block held by \p dest
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void memcpy(const ManagedPtr<T>& dest, const T* src) {
+ memcpy<T>(dest.get(), src, dest.size());
+ }
+
+ /** copies data from memory pointed by \p src to \p dest
+ *
+ * if the two \p src and \p dest have different sizes, the number of elements copied is
+ * equal to the size of the smaller memory block
+ *
+ * \param[in] src device pointer
+ * \param[out] dest device pointer
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void memcpy(const ManagedPtr<T>& dest, const ManagedPtr<T>& src) {
+ memcpy<T>(dest.get(), src.get(), std::min(dest.size(), src.size()));
+ }
+
+ /** sets device memory block to a specific 8-bit value
+ *
+     * \param[out] dest device pointer
+     * \param[in] ch 8-bit value to fill the device memory with
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void memset(const ManagedPtr<T>& dest, std::int8_t ch) {
+ memset<T>(dest.get(), ch, dest.size());
+ }
+
+ /** copies entire memory block pointed by \p src to \p dest asynchronously
+ *
+ * \param[in] src device pointer
+ * \param[out] dest host pointer
+ * \param stream CUDA stream that has to be used for the memory transfer
+ *
+ * Pre-conditions:
+ * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
+ * - \p dest points to page-locked memory
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void memcpy(T *dest, const ManagedPtr<T>& src, const Stream& stream) {
+ CV_Assert(stream);
+ memcpy<T>(dest, src.get(), src.size(), stream);
+ }
+
+ /** copies data from memory pointed by \p src to \p dest asynchronously
+ *
+ * \param[in] src host pointer
+ * \param[out] dest device pointer
+ * \param stream CUDA stream that has to be used for the memory transfer
+ *
+ * Pre-conditions:
+ * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
+ * - \p src points to page-locked memory
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void memcpy(const ManagedPtr<T>& dest, const T* src, const Stream& stream) {
+ CV_Assert(stream);
+ memcpy<T>(dest.get(), src, dest.size(), stream);
+ }
+
+ /** copies data from memory pointed by \p src to \p dest asynchronously
+ *
+ * \param[in] src device pointer
+ * \param[out] dest device pointer
+ * \param stream CUDA stream that has to be used for the memory transfer
+ *
+ * if the two \p src and \p dest have different sizes, the number of elements copied is
+ * equal to the size of the smaller memory block
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+    void memcpy(const ManagedPtr<T>& dest, const ManagedPtr<T>& src, const Stream& stream) {
+ CV_Assert(stream);
+ memcpy<T>(dest.get(), src.get(), std::min(dest.size(), src.size()), stream);
+ }
+
+ /** sets device memory block to a specific 8-bit value asynchronously
+ *
+     * \param[out] dest device pointer
+     * \param[in] ch 8-bit value to fill the device memory with
+ * \param stream CUDA stream that has to be used for the memory operation
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+    void memset(const ManagedPtr<T>& dest, std::int8_t ch, const Stream& stream) {
+ CV_Assert(stream);
+ memset<T>(dest.get(), ch, dest.size(), stream);
+ }
+
+ /** @brief registers host memory as page-locked and unregisters on destruction */
+ class MemoryLockGuard {
+ public:
+ MemoryLockGuard() noexcept : ptr { nullptr } { }
+ MemoryLockGuard(const MemoryLockGuard&) = delete;
+ MemoryLockGuard(MemoryLockGuard&& other) noexcept : ptr{ other.ptr } {
+ other.ptr = nullptr;
+ }
+
+ /** page-locks \p size_in_bytes bytes of memory starting from \p ptr_
+ *
+         * Pre-conditions:
+ * - host memory should be unregistered
+ */
+ MemoryLockGuard(void* ptr_, std::size_t size_in_bytes) {
+ CUDA4DNN_CHECK_CUDA(cudaHostRegister(ptr_, size_in_bytes, cudaHostRegisterPortable));
+ ptr = ptr_;
+ }
+
+ MemoryLockGuard& operator=(const MemoryLockGuard&) = delete;
+ MemoryLockGuard& operator=(MemoryLockGuard&& other) noexcept {
+ ptr = other.ptr;
+ other.ptr = nullptr;
+ return *this;
+ }
+
+ ~MemoryLockGuard() {
+ if(ptr != nullptr)
+ CUDA4DNN_CHECK_CUDA(cudaHostUnregister(ptr));
+ }
+
+ private:
+ void *ptr;
+ };
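+
+    /* Sketch: page-lock a host buffer for the duration of asynchronous transfers
+     * (`stream` is assumed to be a valid non-default stream):
+     *
+     *   std::vector<float> host(1024);
+     *   MemoryLockGuard lock(host.data(), 1024 * sizeof(float));
+     *   ManagedPtr<float> device(1024);
+     *   memcpy(device, host.data(), stream); // asynchronous host-to-device copy
+     *   stream.synchronize();                // transfers must finish before `lock` is destroyed
+     */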
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP
+
+#include <cuda_runtime_api.h>
+
+#ifdef __CUDACC__
+# define CUDA4DNN_HOST __host__
+# define CUDA4DNN_DEVICE __device__
+# define CUDA4DNN_HOST_DEVICE CUDA4DNN_HOST CUDA4DNN_DEVICE
+#else
+# define CUDA4DNN_HOST
+# define CUDA4DNN_DEVICE
+# define CUDA4DNN_HOST_DEVICE
+#endif
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP
+
+#include "nvcc_defs.hpp"
+#include "error.hpp"
+#include "stream.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cuda_runtime_api.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <ostream>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+ /** @brief provides a type-safe device pointer
+ *
+ * DevicePtr wraps a raw pointer and mimics its behaviour. It does not implicitly convert
+ * to a raw pointer. This ensures that accidental mixing of host and device pointers do not happen.
+ *
+ * It is meant to point to locations in device memory. Hence, it provides dereferencing or
+ * array subscript capability for device code only.
+ *
+ * A `const DevicePtr<T>` represents an immutable pointer to a mutable memory.
+ * A `DevicePtr<const T>` represents a mutable pointer to an immutable memory.
+ * A `const DevicePtr<const T>` represents an immutable pointer to an immutable memory.
+ *
+ * A `DevicePtr<T>` can implicitly convert to `DevicePtr<const T>`.
+ *
+ * Specalizations:
+ * - DevicePtr<void>/DevicePtr<const void> do not support pointer arithmetic (but relational operators are provided)
+ * - any device pointer pointing to mutable memory is implicitly convertible to DevicePtr<void>
+ * - any device pointer is implicitly convertible to DevicePtr<const void>
+ * - DevicePtr<void> can be explicitly converted to any device pointer
+ * - DevicePtr<const void> can be explicitly converted to any device pointer pointing to immutable memory
+ */
+ template <class T>
+ class DevicePtr {
+ static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");
+
+ public:
+ using element_type = T;
+ using difference_type = std::ptrdiff_t;
+ using pointer = typename std::add_pointer<element_type>::type;
+ using reference = typename std::add_lvalue_reference<element_type>::type;
+
+ DevicePtr() = default;
+ CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }
+
+ CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }
+
+ CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; };
+
+ CUDA4DNN_DEVICE reference operator[](difference_type idx) const noexcept { return get()[idx]; }
+ CUDA4DNN_DEVICE reference operator*() const noexcept { return *get(); }
+ CUDA4DNN_DEVICE pointer operator->() const noexcept { return get(); }
+
+ template<class U = T, typename std::enable_if<!std::is_const<U>::value, bool>::type = true>
+ CUDA4DNN_HOST_DEVICE operator DevicePtr<typename std::add_const<U>::type>() const noexcept {
+ return DevicePtr<typename std::add_const<U>::type>{ptr};
+ }
+
+ CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }
+
+ CUDA4DNN_HOST_DEVICE DevicePtr operator++() noexcept {
+ ++ptr;
+ return *this;
+ }
+
+ CUDA4DNN_HOST_DEVICE DevicePtr operator++(int) noexcept {
+ auto tmp = DevicePtr(*this);
+ ptr++;
+ return tmp;
+ }
+
+ CUDA4DNN_HOST_DEVICE DevicePtr operator--() noexcept {
+ --ptr;
+ return *this;
+ }
+
+ CUDA4DNN_HOST_DEVICE DevicePtr operator--(int) noexcept {
+ auto tmp = DevicePtr(*this);
+ ptr--;
+ return tmp;
+ }
+
+ CUDA4DNN_HOST_DEVICE DevicePtr operator+=(std::ptrdiff_t offset) noexcept {
+ ptr += offset;
+ return *this;
+ }
+
+ CUDA4DNN_HOST_DEVICE DevicePtr operator-=(std::ptrdiff_t offset) noexcept {
+ ptr -= offset;
+ return *this;
+ }
+
+ CUDA4DNN_HOST_DEVICE friend DevicePtr operator+(DevicePtr lhs, std::ptrdiff_t offset) noexcept {
+ return lhs += offset;
+ }
+
+ CUDA4DNN_HOST_DEVICE friend DevicePtr operator-(DevicePtr lhs, std::ptrdiff_t offset) noexcept {
+ return lhs -= offset;
+ }
+
+ CUDA4DNN_HOST_DEVICE friend difference_type operator-(DevicePtr lhs, DevicePtr rhs) noexcept {
+ return lhs.ptr - rhs.ptr;
+ }
+
+ CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
+ CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
+ CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
+ CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
+ CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
+ CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }
+
+ CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }
+
+ CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
+ using std::swap;
+ swap(lhs.ptr, rhs.ptr);
+ }
+
+ template <class U, class V>
+ CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
+ os << other.get() << " (device)";
+ return os;
+ }
+
+ private:
+ pointer ptr;
+ };
+
+ template <>
+ class DevicePtr<const void> {
+ public:
+ using element_type = const void;
+ using pointer = typename std::add_pointer<element_type>::type;
+
+ DevicePtr() = default;
+
+ /* host const void pointer to const void device pointer */
+ CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }
+
+        /* allow any device pointer to be implicitly converted to a const void device pointer */
+ template <class T>
+ CUDA4DNN_HOST_DEVICE DevicePtr(DevicePtr<T> ptr_) noexcept : ptr{ ptr_.get() } { }
+
+ CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }
+
+ CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; };
+
+ CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }
+
+ CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
+ CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
+ CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
+ CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
+ CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
+ CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }
+
+ /* explicit conversion into host void pointer */
+ CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }
+
+ /* const void device pointer can be explicitly casted into any const device pointer type */
+ template <class T, typename std::enable_if<std::is_const<T>::value, bool>::type = true>
+ CUDA4DNN_HOST_DEVICE explicit operator DevicePtr<T>() const noexcept {
+ return static_cast<T*>(ptr);
+ }
+
+ CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
+ using std::swap;
+ swap(lhs.ptr, rhs.ptr);
+ }
+
+ template <class U, class V>
+ CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
+ os << other.get() << " (device)";
+ return os;
+ }
+
+ private:
+ pointer ptr;
+ };
+
+ template <>
+ class DevicePtr<void> {
+ public:
+ using element_type = void;
+ using pointer = typename std::add_pointer<element_type>::type;
+
+ DevicePtr() = default;
+
+ /* host pointer to device pointer */
+ CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }
+
+        /* allow any device pointer to mutable memory to be implicitly converted to a void device pointer */
+ template <class T, typename std::enable_if<!std::is_const<T>::value, bool>::type = false>
+ CUDA4DNN_HOST_DEVICE DevicePtr(DevicePtr<T> ptr_) noexcept : ptr { ptr_.get() } { }
+
+ CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }
+
+ CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; };
+
+ CUDA4DNN_HOST_DEVICE operator DevicePtr<const void>() const noexcept { return DevicePtr<const void>{ptr}; }
+
+ CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }
+
+ CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
+ CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
+ CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
+ CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
+ CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
+ CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }
+
+ /* explicit conversion into host void pointer */
+ CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }
+
+ /* void device pointer can be explicitly casted into any device pointer type */
+ template <class T>
+ CUDA4DNN_HOST_DEVICE explicit operator DevicePtr<T>() const noexcept {
+ return DevicePtr<T>(static_cast<T*>(ptr));
+ }
+
+ CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
+ using std::swap;
+ swap(lhs.ptr, rhs.ptr);
+ }
+
+ template <class U, class V>
+ CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
+ os << other.get() << " (device)";
+ return os;
+ }
+
+ private:
+ pointer ptr;
+ };
+
+ template <class T>
+ bool is_aligned(DevicePtr<const T> ptr, std::size_t alignment) {
+ auto addr = reinterpret_cast<std::intptr_t>(ptr.get());
+ return addr % alignment == 0;
+ }
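+
+    /* DevicePtr mimics a raw pointer but refuses to mix silently with host pointers
+     * (`raw` is assumed to be a float* obtained from a device allocation):
+     *
+     *   DevicePtr<float> dptr{ raw };    // construction is explicit
+     *   float* p = dptr;                 // error: no implicit conversion to raw pointer
+     *   float* q = dptr.get();           // fine: explicit and intentional
+     *   DevicePtr<const float> c = dptr; // const is added implicitly, as with raw pointers
+     */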
+
+    /** copies \p n elements from \p src to \p dest
+ *
+ * \param[in] src device pointer
+ * \param[out] dest host pointer
+ *
+ * Pre-conditions:
+ * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void memcpy(T *dest, DevicePtr<const T> src, std::size_t n) {
+ if (n <= 0) {
+            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
+ }
+
+ CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest, src.get(), n * sizeof(T), cudaMemcpyDefault));
+ }
+
+ /** copies \p n elements from \p src to \p dest
+ *
+ * \param[in] src host pointer
+ * \param[out] dest device pointer
+ *
+ * Pre-conditions:
+ * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void memcpy(DevicePtr<T> dest, const T* src, std::size_t n) {
+ if (n <= 0) {
+            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
+ }
+
+ CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src, n * sizeof(T), cudaMemcpyDefault));
+ }
+
+ /** copies \p n elements from \p src to \p dest
+ *
+ * \param[in] src device pointer
+ * \param[out] dest device pointer
+ *
+ * Pre-conditions:
+ * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void memcpy(DevicePtr<T> dest, DevicePtr<const T> src, std::size_t n) {
+ if (n <= 0) {
+            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
+ }
+
+ CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src.get(), n * sizeof(T), cudaMemcpyDefault));
+ }
+
+ /** sets \p n elements to \p ch in \p dest
+ *
+     * \param[out] dest device pointer
+     * \param[in] ch 8-bit value to fill the device memory with
+ *
+ * Pre-conditions:
+ * - memory pointed by \p dest must be large enough to hold \p n elements
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void memset(DevicePtr<T> dest, std::int8_t ch, std::size_t n) {
+ if (n <= 0) {
+            CV_Error(Error::StsBadArg, "number of elements to set is zero or negative");
+ }
+
+ CUDA4DNN_CHECK_CUDA(cudaMemset(dest.get(), ch, n * sizeof(T)));
+ }
+
+ /** copies \p n elements from \p src to \p dest asynchronously
+ *
+ * \param[in] src device pointer
+ * \param[out] dest host pointer
+ * \param stream CUDA stream that has to be used for the memory transfer
+ *
+ * Pre-conditions:
+ * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
+ * - \p dest points to page-locked memory
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void memcpy(T *dest, DevicePtr<const T> src, std::size_t n, const Stream& stream) {
+ if (n <= 0) {
+            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
+ }
+
+ CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest, src.get(), n * sizeof(T), cudaMemcpyDefault, stream.get()));
+ }
+
+    /** copies \p n elements from \p src to \p dest asynchronously
+ *
+ * \param[in] src host pointer
+ * \param[out] dest device pointer
+ * \param stream CUDA stream that has to be used for the memory transfer
+ *
+ * Pre-conditions:
+ * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
+ * - \p src points to page-locked memory
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void memcpy(DevicePtr<T> dest, const T *src, std::size_t n, const Stream& stream) {
+ if (n <= 0) {
+            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
+ }
+
+ CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src, n * sizeof(T), cudaMemcpyDefault, stream.get()));
+ }
+
+ /** copies \p n elements from \p src to \p dest asynchronously
+ *
+ * \param[in] src device pointer
+ * \param[out] dest device pointer
+ * \param stream CUDA stream that has to be used for the memory transfer
+ *
+ * Pre-conditions:
+ * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void memcpy(DevicePtr<T> dest, DevicePtr<const T> src, std::size_t n, const Stream& stream) {
+ if (n <= 0) {
+            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
+ }
+
+ CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src.get(), n * sizeof(T), cudaMemcpyDefault, stream.get()));
+ }
+
+ /** sets \p n elements to \p ch in \p dest asynchronously
+ *
+     * \param[out] dest device pointer
+     * \param[in] ch 8-bit value to fill the device memory with
+ * \param stream CUDA stream that has to be used for the memory operation
+ *
+ * Pre-conditions:
+ * - memory pointed by \p dest must be large enough to hold \p n elements
+ *
+ * Exception Guarantee: Basic
+ */
+ template <class T>
+ void memset(DevicePtr<T> dest, std::int8_t ch, std::size_t n, const Stream& stream) {
+ if (n <= 0) {
+            CV_Error(Error::StsBadArg, "number of elements to set is zero or negative");
+ }
+
+ CUDA4DNN_CHECK_CUDA(cudaMemsetAsync(dest.get(), ch, n * sizeof(T), stream.get()));
+ }
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP
+
+#include "pointer.hpp"
+#include "nvcc_defs.hpp"
+
+#include <cstddef>
+#include <type_traits>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+ /** @brief provides non-owning mutable access for device arrays
+ *
+ * const Span<T>/Span<T> provides mutable access to the elements unless T is const qualified
+ * const Span<T> makes the span immutable but not the elements
+ */
+ template <class T>
+ class Span {
+ static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");
+
+ public:
+ using value_type = T;
+ using size_type = std::size_t;
+ using difference_type = std::ptrdiff_t;
+
+ using pointer = DevicePtr<value_type>;
+ using const_pointer = DevicePtr<typename std::add_const<value_type>::type>;
+ using reference = typename std::add_lvalue_reference<value_type>::type;
+        using const_reference = typename std::add_lvalue_reference<typename std::add_const<value_type>::type>::type;
+
+ using iterator = pointer;
+ using const_iterator = const_pointer;
+
+ Span() noexcept : ptr{ nullptr }, sz{ 0 } { }
+ CUDA4DNN_HOST_DEVICE Span(pointer first, pointer last) noexcept : ptr{ first }, sz{ last - first } { }
+ CUDA4DNN_HOST_DEVICE Span(pointer first, size_type count) noexcept : ptr{ first }, sz{ count } { }
+
+ CUDA4DNN_HOST_DEVICE size_type size() const noexcept { return sz; }
+ CUDA4DNN_HOST_DEVICE bool empty() const noexcept { return size() == 0; }
+
+ CUDA4DNN_DEVICE reference operator[](difference_type index) const { return ptr[index]; }
+ CUDA4DNN_HOST_DEVICE pointer data() const noexcept { return ptr; }
+
+ template<class U = T, class V = typename std::add_const<U>::type,
+ typename std::enable_if<!std::is_const<U>::value, bool>::type = true>
+ CUDA4DNN_HOST_DEVICE operator Span<V>() const noexcept { return Span<V>{ptr, sz}; }
+
+ private:
+ pointer ptr;
+ size_type sz;
+ };
+
+ /** @brief provides non-owning immutable view for device arrays */
+ template <class T>
+ using View = Span<const T>;
+
+ /** returns true if the address of a span/view is aligned to \p alignment number of elements (not bytes) */
+ template <class T>
+ bool is_address_aligned(View<T> v, std::size_t alignment) {
+ return is_aligned(v.data(), alignment * sizeof(T));
+ }
+
+ /** returns true if the size of a span/view is a multiple of \p alignment */
+ template <class T>
+ bool is_size_aligned(View<T> v, std::size_t alignment) {
+ return v.size() % alignment == 0;
+ }
+
+ /** @brief returns true if the address and the size of the span/view is aligned
+ * \p alignment refers to the number of elements (not bytes)
+ */
+ template <class T>
+ bool is_fully_aligned(View<T> v, std::size_t alignment) {
+ return is_address_aligned(v, alignment) && is_size_aligned(v, alignment);
+ }
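+
+    /* Sketch: viewing a device buffer as a span and checking vectorization constraints
+     * (`ptr` and `count` are assumed to describe an existing device allocation):
+     *
+     *   Span<float> span(ptr, count);    // mutable, non-owning
+     *   View<float> view = span;         // implicit conversion to an immutable view
+     *   if (is_fully_aligned<float>(view, 4)) {
+     *       // safe to process the buffer in float4-sized chunks
+     *   }
+     */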
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP
+
+#include "error.hpp"
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/utils/logger.hpp>
+
+#include <cuda_runtime_api.h>
+
+#include <memory>
+#include <sstream>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+ /** @brief noncopyable smart CUDA stream
+ *
+ * UniqueStream is a smart non-sharable wrapper for CUDA stream handle which ensures that
+ * the handle is destroyed after use. Unless explicitly specified by a constructor argument,
+ * the stream object represents the default stream.
+ */
+ class UniqueStream {
+ public:
+ UniqueStream() noexcept : stream{ 0 } { }
+        UniqueStream(const UniqueStream&) = delete;
+ UniqueStream(UniqueStream&& other) noexcept {
+ stream = other.stream;
+ other.stream = 0;
+ }
+
+ UniqueStream(bool create) : stream{ 0 } {
+ if (create) {
+ CUDA4DNN_CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+ }
+ }
+
+ ~UniqueStream() {
+ try {
+ if (stream != 0)
+ CUDA4DNN_CHECK_CUDA(cudaStreamDestroy(stream));
+ } catch (const CUDAException& ex) {
+ std::ostringstream os;
+ os << "Asynchronous exception caught during CUDA stream destruction.\n";
+ os << ex.what();
+ os << "Exception will be ignored.\n";
+ CV_LOG_WARNING(0, os.str().c_str());
+ }
+ }
+
+ UniqueStream& operator=(const UniqueStream&) = delete;
+ UniqueStream& operator=(UniqueStream&& other) noexcept {
+ stream = other.stream;
+ other.stream = 0;
+ return *this;
+ }
+
+ /** returns the raw CUDA stream handle */
+ cudaStream_t get() const noexcept { return stream; }
+
+ void synchronize() const { CUDA4DNN_CHECK_CUDA(cudaStreamSynchronize(stream)); }
+ bool busy() const {
+ auto status = cudaStreamQuery(stream);
+ if (status == cudaErrorNotReady)
+ return true;
+ CUDA4DNN_CHECK_CUDA(status);
+ return false;
+ }
+
+ private:
+ cudaStream_t stream;
+ };
+
+ /** @brief sharable smart CUDA stream
+ *
+ * Stream is a smart sharable wrapper for CUDA stream handle which ensures that
+ * the handle is destroyed after use. Unless explicitly specified by a constructor argument,
+ * the stream object represents the default stream.
+ *
+ * @note Moving a Stream object to another invalidates the former
+ */
+ class Stream {
+ public:
+ Stream() : stream(std::make_shared<UniqueStream>()) { }
+ Stream(const Stream&) = default;
+ Stream(Stream&&) = default;
+
+ /** if \p create is `true`, a new stream will be created instead of the otherwise default stream */
+ Stream(bool create) : stream(std::make_shared<UniqueStream>(create)) { }
+
+ Stream& operator=(const Stream&) = default;
+ Stream& operator=(Stream&&) = default;
+
+ /** blocks the caller thread until all operations in the stream are complete */
+ void synchronize() const { stream->synchronize(); }
+
+ /** returns true if there are operations pending in the stream */
+ bool busy() const { return stream->busy(); }
+
+ /** returns true if the stream is valid */
+ explicit operator bool() const noexcept { return static_cast<bool>(stream); }
+
+ cudaStream_t get() const noexcept {
+ CV_Assert(stream);
+ return stream->get();
+ }
+
+ private:
+ std::shared_ptr<UniqueStream> stream;
+ };
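+
+    /* Sketch: a dedicated non-blocking stream shared by copy semantics
+     * (`device_ptr`, `host_ptr` and `n` are assumed to exist):
+     *
+     *   Stream stream(true);                     // create a new stream instead of the default one
+     *   memcpy(device_ptr, host_ptr, n, stream); // queue an asynchronous copy
+     *   if (stream.busy())
+     *       stream.synchronize();                // block until queued work completes
+     */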
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_HPP
+
+#include "nvcc_defs.hpp"
+#include "memory.hpp"
+#include "cublas.hpp"
+#include "cudnn.hpp"
+#include "span.hpp"
+
+#include "../cxx_utils/resizable_static_array.hpp"
+#include "../cxx_utils/is_iterator.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <array>
+#include <functional>
+#include <algorithm>
+#include <numeric>
+#include <iterator>
+#include <vector>
+#include <utility>
+
+#ifndef CSL_MAX_TENSOR_RANK
+ #define CSL_MAX_TENSOR_RANK 6
+#endif
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+ /** \file tensor.hpp
+ *
+ * TYPE | OWNERSHIP | MUTABLE
+ * ------------ + --------- + --------
+ * Tensor | Yes | Yes
+ * TensorSpan | No | Yes
+ * TensorView | No | No
+ *
+ * Tensor is implicitly convertible to TensorSpan and TensorView
+ * TensorSpan is implicitly convertible to TensorView
+ *
+ * Concepts and template parameter naming convention:
+ * - "MutableTensorType" can refer to a Tensor or TensorSpan
+ * - "ImmutableTensorType" can refer to a Tensor, TensorSpan or TensorView
+ * - "TensorType" can refer to a Tensor, TensorSpan or TensorView
+ *
+ * "ImmutableTensorType" is used when the tensor data might be used.
+ * "TensorType" is used when only meta-information such as the size or shape is required, i.e. the data won't be touched
+ */
+
+    /** if the \p axis is a negative index, the equivalent positive index is returned; otherwise, returns \p axis */
+ CUDA4DNN_HOST_DEVICE constexpr std::size_t clamp_axis(int axis, std::size_t rank) {
+ return axis < 0 ? axis + rank : axis;
+ }
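+
+    /* e.g. clamp_axis(-1, 4) == 3 and clamp_axis(2, 4) == 2 */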
+
+ /** @brief multi-dimensional contiguous non-copyable GPU tensor
+ *
+ * \tparam T type of data stored
+ *
+ * @note scalars or zero rank tensors are not supported
+ * @note the maximum rank supported is controlled by the `CSL_MAX_TENSOR_RANK` preprocessor symbol
+ */
+ template <class T>
+ class Tensor {
+        static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");
+
+ public:
+ using value_type = typename ManagedPtr<T>::element_type;
+ using pointer = typename ManagedPtr<value_type>::pointer;
+ using const_pointer = typename ManagedPtr<value_type>::const_pointer;
+ using size_type = typename ManagedPtr<value_type>::size_type;
+
+ Tensor() noexcept { }
+ Tensor(const Tensor&) = delete;
+ Tensor(Tensor&& other) noexcept {
+ data = std::move(other.data);
+ shape = other.shape;
+ other.shape.clear();
+ }
+
+ /** @brief constructs a tensor of a specific shape
+ *
+ * Whatever arguments are accepted by the resize methods are accepted here.
+ */
+ template <class ...Args>
+ Tensor(Args&&... sizes) { resize(std::forward<Args>(sizes)...); }
+
+ Tensor& operator=(const Tensor&) = delete;
+ Tensor& operator=(Tensor&& other) noexcept {
+ data = std::move(other.data);
+ shape = other.shape;
+ other.shape.clear();
+ return *this;
+ }
+
+ /** returns true if the tensor is empty (or uninitialized) */
+ bool empty() const noexcept { return shape.size() == 0; }
+
+ /** returns the total number of elements in the tensor
+ *
+ * Pre-conditions:
+ * - tensor must be non-empty
+ */
+ size_type size() const noexcept {
+ CV_Assert(!empty());
+ return std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<size_type>());
+ }
+
+ /** returns the rank of the tensor
+ *
+ * Pre-conditions:
+ * - tensor must be non-empty
+ */
+ size_type rank() const noexcept {
+ CV_Assert(!empty());
+ return shape.size();
+ }
+
+ /** @brief returns the length of the axis
+ *
+ * Every axis is assigned a zero-based index which can be used to select an axis.
+ * Negative index can be used to select an axis from the end.
+ *
+ * Examples:
+ * > -1 represents the last axis
+ * > 0 represents the first axis
+ * > 1 represents the second axis
+ *
+ * Pre-conditions:
+ * - tensor must be non-empty
+ * - the axis must be in the range [-rank(), rank())
+ */
+ size_type get_axis_size(int axis) const noexcept {
+ axis = clamp_axis(axis, rank());
+ CV_Assert(axis >= 0 && axis < rank());
+ return shape[axis];
+ }
+
+ /** @brief returns the combined size of the axes in an axis range
+ *
+ * if the shape is [3 x 5 x 7 x 11]
+ * - `size_range(0, 2)` will return 3 x 5 = 15
+ * - `size_range(1, 3)` will return 5 x 7 = 35
+ * - `size_range(0, 4)` will return 3 x 5 x 7 x 11 = 1155
+ *
+ * Pre-conditions:
+ * - tensor must be non-empty
+ * - `axis_start` must be less than or equal to `axis_end`
+ * - `axis_end` must be less than or equal to the rank
+ *
+         * returns one if `axis_start` and `axis_end` are equal
+ */
+ size_type size_range(size_type axis_start, size_type axis_end) const noexcept {
+ CV_Assert(!empty());
+ CV_Assert(axis_start <= axis_end);
+ CV_Assert(axis_end <= rank());
+ auto start = std::begin(shape) + axis_start;
+ auto end = std::begin(shape) + axis_end;
+ return std::accumulate(start, end, 1, std::multiplies<size_type>());
+ }
+
+ /** returns an std::vector containing axis lengths starting from axis zero
+ *
+ * Pre-conditions:
+ * - tensor must be non-empty
+ *
+ * Exception Guarantee: Strong
+ */
+ std::vector<size_type> shape_as_vector() const {
+ CV_Assert(!empty());
+ return std::vector<size_type>(std::begin(shape), std::end(shape));
+ }
+
+ /** returns a pointer to mutable device memory owned by the tensor */
+ pointer get() noexcept { return data.get(); }
+
+ /** returns a pointer to immutable device memory owned by the tensor */
+ const_pointer get() const noexcept { return data.get(); }
+
+ /** @brief releases the memory owned by the tensor
+ *
+ * Pre-conditions:
+ * - tensor must be non-empty
+ *
+ * Exception Guarantee: Strong
+ */
+ void clear() {
+ CV_Assert(!empty());
+ data.reset();
+ shape.clear();
+ }
+
+ /** @brief resizes the tensor
+ *
+ * Pre-conditions:
+ * - [start, end) represents a forward range containing the length of the axes in order starting from axis zero
+ * - number of lengths provided must not exceed the maximum tensor rank (CSL_MAX_TENSOR_RANK)
+ * - the sizes must be positive integers
+ *
+ * Exception Guarantee: Strong
+ */
+ template <class ForwardItr>
+ typename std::enable_if<cxx_utils::is_forward_iterator<ForwardItr>::value, void>
+ ::type resize(ForwardItr start, ForwardItr end) {
+ CV_Assert(start != end);
+ CV_Assert(std::distance(start, end) <= CSL_MAX_TENSOR_RANK);
+
+ using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
+ auto total = std::accumulate(start, end, 1, std::multiplies<ItrValueType>());
+ data.reset(total);
+
+ shape.assign(start, end);
+ }
+
+ /** @brief resizes the tensor
+ * constructs a range out of the arguments and invokes the range-based resize method
+ */
+ template <class ...Sizes>
+ void resize(Sizes... new_sizes_) {
+ static_assert(sizeof...(Sizes) <= CSL_MAX_TENSOR_RANK, "required rank exceeds maximum supported rank");
+ static_assert(sizeof...(Sizes) > 0, "no sizes provided");
+ std::array<size_type, sizeof...(Sizes)> new_sizes = { static_cast<size_type>(new_sizes_)... };
+ resize(std::begin(new_sizes), std::end(new_sizes));
+ }
+
+ /** @brief resizes the tensor
+ *
+ * Pre-conditions:
+ * - the reference tensor must be non-empty
+ *
+ * Exception Guarantee: Strong
+ */
+ template <class TensorType>
+ void resize_as(const TensorType& tensor) {
+ CV_Assert(!tensor.empty());
+ cxx_utils::resizable_static_array<size_type, CSL_MAX_TENSOR_RANK> new_sizes(tensor.rank());
+ for (int i = 0; i < new_sizes.size(); i++)
+ new_sizes[i] = tensor.get_axis_size(i);
+ resize(std::begin(new_sizes), std::end(new_sizes));
+ }
+
+ /** @brief reshapes the tensor
+ *
+ * Length deduction:
+ * The length of at most one axis can be deduced using the total size constraint. The axis can
+ * be marked for deduction by specifying the size as -1.
+ *
+ * The axes for which no size was provided (excluding -1) will be assumed to be one.
+ *
+ * Pre-conditions:
+ * - the tensor must be non-empty
+ * - [start, end) represents a forward range containing the length of the axes starting from axis zero
+ * - the number of lengths provided must be less than or equal to the tensor rank
+ * - at most one axis length is allowed for length deduction
+ * - the lengths provided must ensure that the total number of elements remains unchanged
+ *
+ * Exception Guarantee: Strong
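+         *
+         * Example (illustrative):
+         * @code
+         * Tensor<float> t(4, 6);  // 24 elements
+         * t.reshape(-1, 3);       // new shape: [8 x 3]; the -1 is deduced as 24 / 3
+         * @endcode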
+ */
+ template <class ForwardItr>
+ typename std::enable_if<cxx_utils::is_forward_iterator<ForwardItr>::value, void>
+ ::type reshape(ForwardItr start, ForwardItr end) {
+ CV_Assert(start != end);
+ CV_Assert(std::distance(start, end) <= rank());
+
+ using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
+
+ /* the user may leave at most one axis size for deduction by specifying -1 */
+ auto sizes_to_deduce = std::count(start, end, -1);
+ if (sizes_to_deduce > 1) { CV_Error(Error::StsBadArg, "only one axis size can be deduced"); }
+
+ /* sizes must be positive numbers with the exception of -1 */
+ auto invalid_sizes = std::count_if(start, end, [](ItrValueType x) {
+ return !(x > 0 || x == -1);
+ });
+ if (invalid_sizes) { CV_Error(Error::StsBadArg, "invalid axis size"); }
+
+ /* compute the total number of elements in the new tensor */
+ size_type unknown_size = 0;
+ auto total = std::accumulate(start, end, 1, std::multiplies<ItrValueType>());
+ if (total < 0) {
+ /* there is an unknown size */
+ if (std::abs(total) <= size()) {
+ unknown_size = size() / std::abs(total);
+ total = size();
+ }
+                /* edge case: if std::abs(total) exceeds size(), deduction is impossible;
+                ** since `total` remains negative, the size check that follows will fail and throw an error
+                */
+ }
+
+ /* the number of elements before and after reshape must be exactly same */
+ if (total != size()) {
+ CV_Error(Error::StsBadArg, "new axes do not preserve the tensor element count");
+ }
+
+ /* we assume the size of the unspecified axes to be one */
+ std::fill(std::begin(shape), std::end(shape), 1);
+ std::copy_backward(start, end, std::end(shape));
+
+ /* replace the unknown axis with the correct value */
+ std::replace(std::begin(shape), std::end(shape), size_type(-1), unknown_size);
+ }
+
+ /** @brief reshapes the tensor
+ * constructs a range out of the arguments and invokes range-based reshape method
+ */
+ template <class ...Sizes>
+ void reshape(Sizes... new_sizes_) {
+ static_assert(sizeof...(Sizes) <= CSL_MAX_TENSOR_RANK, "required rank exceeds maximum supported rank");
+ static_assert(sizeof...(Sizes) > 0, "no sizes provided");
+ std::array<std::int64_t, sizeof...(Sizes)> new_sizes = { static_cast<std::int64_t>(new_sizes_)... };
+ reshape(std::begin(new_sizes), std::end(new_sizes));
+ }
+
+ /** @brief reshapes the tensor
+ *
+ * Pre-conditions:
+ * - the reference tensor must be a non-empty tensor
+ * - the reference tensor's rank must be lesser than or equal to the rank of target tensor
+ *
+ * Exception Guarantee: Strong
+ */
+ template <class TensorType>
+ void reshape_as(const TensorType& tensor) {
+ CV_Assert(!tensor.empty());
+ cxx_utils::resizable_static_array<size_type, CSL_MAX_TENSOR_RANK> new_sizes(tensor.rank());
+ for (int i = 0; i < new_sizes.size(); i++)
+ new_sizes[i] = tensor.get_axis_size(i);
+ reshape(std::begin(new_sizes), std::end(new_sizes));
+ }
+
+ /** @brief squeezes the tensor
+ *
+ * removes all axes of unit size
+ *
+ * Pre-conditions:
+ * - the tensor must be non-empty
+ * - the tensor's rank must be at least two
+ *
+ * Exception Guarantee: Strong
+ */
+ void squeeze() {
+ CV_Assert(!empty());
+ CV_Assert(rank() >= 2);
+ auto itr = std::remove(std::begin(shape), std::end(shape), 1);
+ shape.resize(itr - std::begin(shape));
+ }
+
+ /** @brief squeezes the tensor
+ *
+ * removes the specified axis if the axis length is one; otherwise, ignores the request
+ *
+ * Pre-conditions:
+ * - the tensor must be non-empty
+ * - the tensor's rank must be at least two
+ *
+ * Exception Guarantee: Strong
+ */
+ void squeeze(int axis) {
+ CV_Assert(!empty());
+ CV_Assert(rank() >= 2);
+ axis = clamp_axis(axis, rank());
+ CV_Assert(axis >= 0 && axis < rank());
+ shape.erase(std::begin(shape) + axis);
+ }
+
+ /** @brief unsqueezes the tensor
+ *
+         * adds an axis of unit size before the specified axis
+ *
+ * Pre-conditions:
+ * - the tensor must be non-empty
+ * - the tensor's rank must be less than the maximum supported rank (CSL_MAX_TENSOR_RANK)
+ *
+ * Exception Guarantee: Strong
+ */
+ void unsqueeze(int axis = 0) {
+ CV_Assert(!empty());
+ CV_Assert(rank() < CSL_MAX_TENSOR_RANK);
+ axis = clamp_axis(axis, rank());
+ CV_Assert(axis >= 0 && axis < rank());
+ shape.insert(std::begin(shape) + axis, 1);
+ }
+
+ operator Span<T>() noexcept { return Span<T>(data.get(), size()); }
+ operator View<T>() const noexcept { return View<T>(data.get(), size()); }
+
+ friend void swap(Tensor& lhs, Tensor& rhs) noexcept {
+ using std::swap;
+ swap(lhs.data, rhs.data);
+ swap(lhs.shape, rhs.shape);
+ }
+
+ private:
+ cxx_utils::resizable_static_array<size_type, CSL_MAX_TENSOR_RANK> shape;
+ ManagedPtr<value_type> data;
+ };
+
+ /** @brief provides a non-owning mutable span of a Tensor
+ *
+ * \tparam T type of data stored by the tensor
+ *
+ * A span is valid if and only if the following hold true:
+ * - span is non-empty
+ * - spanned memory is still allocated
+ *
+ * A span may be used if and only if it is valid.
+ */
+ template <class T>
+ class TensorSpan {
+ public:
+ using value_type = typename Tensor<T>::value_type;
+ using pointer = typename Tensor<T>::pointer;
+ using const_pointer = typename Tensor<T>::const_pointer;
+ using size_type = typename Tensor<T>::size_type;
+
+ TensorSpan() noexcept : ptr{ nullptr } { }
+ TensorSpan(const TensorSpan&) noexcept = default;
+ TensorSpan(Tensor<T>& tensor) noexcept : ptr{ tensor.get() } {
+ const auto rank = tensor.rank();
+ shape.resize(rank);
+ for (int i = 0; i < rank; i++)
+ shape[i] = tensor.get_axis_size(i);
+ }
+
+ template <class ForwardItr>
+ TensorSpan(pointer ptr_, ForwardItr start, ForwardItr end) : ptr{ ptr_ } {
+ CV_Assert(start != end);
+ CV_Assert(std::distance(start, end) <= CSL_MAX_TENSOR_RANK);
+
+ using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
+ if (std::any_of(start, end, [](ItrValueType x) { return x <= 0; })) {
+ CV_Error(Error::StsBadArg, "the given shape contains negative or zero size");
+ }
+
+ shape.assign(start, end);
+ }
+
+ /** creates a subspan of a tensor (or span); refer to subspan method for more details */
+ template <class... Args>
+ TensorSpan(TensorSpan other, size_type offset, Args&&... args)
+ : TensorSpan(other.subspan(offset, std::forward<Args>(args)...)) { }
+
+ /** returns true if the span is empty */
+ bool empty() const noexcept { return shape.size() == 0; }
+
+ /** returns the total number of elements in the span
+ *
+ * Pre-conditions:
+ * - span must be non-empty
+ */
+ size_type size() const noexcept {
+ CV_Assert(!empty());
+ return std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<size_type>());
+ }
+
+ /** returns the rank of the span
+ *
+ * Pre-conditions:
+ * - span must be non-empty
+ */
+ size_type rank() const noexcept {
+ CV_Assert(!empty());
+ return shape.size();
+ }
+
+ /** @brief returns the length of the axis
+ *
+ * Every axis is assigned a zero-based index which can be used to select an axis.
+ * Negative index can be used to select an axis from the end.
+ *
+ * Examples:
+ * > -1 represents the last axis
+ * > 0 represents the first axis
+ * > 1 represents the second axis
+ *
+ * Pre-conditions:
+ * - span must be non-empty
+ * - the axis must be in the range [-rank(), rank())
+ */
+ size_type get_axis_size(int axis) const noexcept {
+ axis = clamp_axis(axis, rank());
+ CV_Assert(axis >= 0 && axis < rank());
+ return shape[axis];
+ }
+
+ /** @brief returns the combined size of the axes in an axis range
+ *
+ * if the shape is [3 x 5 x 7 x 11]
+ * - `size_range(0, 2)` will return 3 x 5 = 15
+ * - `size_range(1, 3)` will return 5 x 7 = 35
+ * - `size_range(0, 4)` will return 3 x 5 x 7 x 11 = 1155
+ *
+ * Pre-conditions:
+ * - span must be non-empty
+ * - `axis_start` must be less than or equal to `axis_end`
+ * - `axis_end` must be less than or equal to the rank
+ *
+         * returns one if `axis_start` and `axis_end` are equal
+ */
+ size_type size_range(size_type axis_start, size_type axis_end) const noexcept {
+ CV_Assert(!empty());
+ CV_Assert(axis_start <= axis_end);
+ CV_Assert(axis_end <= rank());
+ auto start = std::begin(shape) + axis_start;
+ auto end = std::begin(shape) + axis_end;
+ return std::accumulate(start, end, 1, std::multiplies<size_type>());
+ }
+
+ /** returns an std::vector containing axis lengths starting from axis zero
+ *
+ * Pre-conditions:
+ * - span must be non-empty
+ *
+ * Exception Guarantee: Strong
+ */
+ std::vector<size_type> shape_as_vector() const {
+ CV_Assert(!empty());
+ return std::vector<size_type>(std::begin(shape), std::end(shape));
+ }
+
+ /** returns a pointer to mutable device memory */
+ pointer get() const noexcept { return ptr; }
+
+ /** @brief clears the span
+ *
+ * Pre-conditions:
+ * - span must be non-empty
+ *
+ * Exception Guarantee: Strong
+ */
+ void clear() noexcept {
+ CV_Assert(!empty());
+ ptr = nullptr;
+ shape.clear();
+ }
+
+ /** @brief reshapes the span
+ *
+ * Length deduction:
+ * The length of at most one axis can be deduced using the total size constraint. The axis can
+ * be marked for deduction by specifying the corresponding size as -1.
+ *
+ * The axes for which no size was provided (excluding -1) will be assumed to be one.
+ *
+ * Pre-conditions:
+ * - the span must be non-empty
+ * - [start, end) represents a forward range containing the length of the axes in order
+ * - the number of axis lengths must be less than or equal to the rank
+ * - at most one axis length is allowed for length deduction
+         * - the lengths provided must ensure that the total number of elements remains unchanged
+ *
+ * Exception Guarantee: Strong
+ */
+ template <class ForwardItr>
+ typename std::enable_if<cxx_utils::is_forward_iterator<ForwardItr>::value, void>
+ ::type reshape(ForwardItr start, ForwardItr end) {
+ CV_Assert(start != end);
+ CV_Assert(std::distance(start, end) <= rank());
+
+ using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
+
+ /* the user may leave at most one axis size for deduction by specifying -1 */
+ auto sizes_to_deduce = std::count(start, end, -1);
+ if (sizes_to_deduce > 1) { CV_Error(Error::StsBadArg, "only one axis size can be deduced"); }
+
+ /* sizes must be positive numbers with the exception of -1 */
+ auto invalid_sizes = std::count_if(start, end, [](ItrValueType x) {
+ return !(x > 0 || x == -1);
+ });
+ if (invalid_sizes) { CV_Error(Error::StsBadArg, "invalid axis size"); }
+
+ /* compute the total number of elements in the new tensor */
+ size_type unknown_size = 0;
+ auto total = std::accumulate(start, end, 1, std::multiplies<ItrValueType>());
+ if (total < 0) {
+ /* there is an unknown size */
+ if (std::abs(total) <= size()) {
+ unknown_size = size() / std::abs(total);
+ total = size();
+ }
+                /* edge case: if std::abs(total) exceeds size(), deduction is impossible;
+                ** since `total` remains negative, the size check that follows will fail and throw an error
+                */
+ }
+
+ /* the number of elements before and after reshape must be exactly same */
+ if (total != size()) {
+ CV_Error(Error::StsBadArg, "new axes do not preserve the tensor element count");
+ }
+
+ /* we assume the size of the unspecified axes to be one */
+ std::fill(std::begin(shape), std::end(shape), 1);
+ std::copy_backward(start, end, std::end(shape));
+
+ /* replace the unknown axis with the correct value */
+ std::replace(std::begin(shape), std::end(shape), size_type(-1), unknown_size);
+ }
+
+ /** @brief reshapes the tensor
+ * constructs a range out of the arguments and invokes the range-based reshape method
+ */
+ template <class ...Sizes>
+ void reshape(Sizes... new_sizes_) {
+ static_assert(sizeof...(Sizes) <= CSL_MAX_TENSOR_RANK, "unsupported tensor rank");
+ static_assert(sizeof...(Sizes) > 0, "no sizes provided");
+ std::array<std::int64_t, sizeof...(Sizes)> new_sizes = { static_cast<std::int64_t>(new_sizes_)... };
+ reshape(std::begin(new_sizes), std::end(new_sizes));
+ }
+
+ /** @brief reshapes the span
+ *
+ * Pre-conditions:
+ * - the reference tensor/span/view must be non-empty
+ * - the reference tensor/span/view's rank must be less than or equal to the rank of the span
+ *
+ * Exception Guarantee: Strong
+ */
+ template <class TensorType>
+ void reshape_as(const TensorType& tensor) {
+ CV_Assert(!tensor.empty());
+ cxx_utils::resizable_static_array<size_type, CSL_MAX_TENSOR_RANK> new_sizes(tensor.rank());
+ for (int i = 0; i < new_sizes.size(); i++)
+ new_sizes[i] = tensor.get_axis_size(i);
+ reshape(std::begin(new_sizes), std::end(new_sizes));
+ }
+
+ /** @brief squeezes the tensor
+ *
+ * removes all axes of unit size
+ *
+ * Pre-conditions:
+ * - the span must be non-empty
+ * - the span's rank must be at least two
+ *
+ * Exception Guarantee: Strong
+ */
+ void squeeze() {
+ CV_Assert(!empty());
+ CV_Assert(rank() >= 2);
+ auto itr = std::remove(std::begin(shape), std::end(shape), 1);
+ shape.resize(itr - std::begin(shape));
+ }
+
+ /** @brief squeezes the tensor
+ *
+ * removes the specified axis if the axis length is one; otherwise, ignores the request
+ *
+ * Pre-conditions:
+ * - the span must be non-empty
+ * - the span's rank must be at least two
+ *
+ * Exception Guarantee: Strong
+ */
+ void squeeze(int axis) {
+ CV_Assert(!empty());
+ CV_Assert(rank() >= 2);
+ axis = clamp_axis(axis, rank());
+ CV_Assert(axis >= 0 && axis < rank());
+ shape.erase(std::begin(shape) + axis);
+ }
+
+ /** @brief unsqueezes the tensor
+ *
+         * adds an axis of unit size before the specified axis
+ *
+ * Pre-conditions:
+ * - the span must be non-empty
+ * - the span's rank must be less than the maximum supported rank (CSL_MAX_TENSOR_RANK)
+ *
+ * Exception Guarantee: Strong
+ */
+ void unsqueeze(int axis = 0) {
+ CV_Assert(!empty());
+ CV_Assert(rank() < CSL_MAX_TENSOR_RANK);
+ axis = clamp_axis(axis, rank());
+ CV_Assert(axis >= 0 && axis < rank());
+ shape.insert(std::begin(shape) + axis, 1);
+ }
+
+ /** @brief obtains a subspan of the span
+ *
+ * Pre-conditions:
+ * - the span must be non-empty
+ * - the `offset` must be less than the size of the span
+ * - [start, end) represents a forward range containing length of the subspan axes
+ * - the lengths provided must ensure that the number of elements does not exceed (old size - offset)
+ *
+ * Exception Guarantee: Strong
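+         *
+         * Example (illustrative; `tensor` is assumed to be a non-empty Tensor<float>):
+         * @code
+         * TensorSpan<float> span = tensor;      // e.g. a span of shape [10 x 4]
+         * auto sub = span.subspan(8, 2, 2, 2);  // subspan of shape [2 x 2 x 2] starting at element offset 8
+         * @endcode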
+ */
+ template <class ForwardItr>
+ typename std::enable_if<cxx_utils::is_forward_iterator<ForwardItr>::value, TensorSpan>
+ ::type subspan(size_type offset, ForwardItr start, ForwardItr end) const {
+ CV_Assert(start != end);
+ CV_Assert(std::distance(start, end) <= rank());
+
+ auto cur_size = size();
+ CV_Assert(offset < cur_size);
+
+ using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
+
+ /* sizes must be positive numbers */
+ auto invalid_sizes = std::count_if(start, end, [](ItrValueType x) {
+ return !(x > 0);
+ });
+ if (invalid_sizes) { CV_Error(Error::StsBadArg, "invalid axis size"); }
+
+ /* the number of elements must be equal to the new size */
+ auto max_size = (cur_size - offset);
+ auto total = std::accumulate(start, end, 1, std::multiplies<ItrValueType>());
+ if (total > max_size) {
+ CV_Error(Error::StsBadArg, "axis lengths lead to OOB accesses");
+ }
+
+ TensorSpan temp;
+ temp.shape.assign(start, end);
+ temp.ptr = ptr + offset;
+ return temp;
+ }
+
+ /** @brief obtains a subspan of the span
+ * constructs a range out of the size arguments and invokes the range-based subspan method
+ */
+ template <class ...Sizes>
+ TensorSpan subspan(size_type offset, Sizes... new_sizes_) const {
+ static_assert(sizeof...(Sizes) <= CSL_MAX_TENSOR_RANK, "required rank exceeds maximum supported rank");
+ static_assert(sizeof...(Sizes) > 0, "no sizes provided");
+ std::array<std::int64_t, sizeof...(Sizes)> new_sizes = { static_cast<std::int64_t>(new_sizes_)... };
+ return subspan(offset, std::begin(new_sizes), std::end(new_sizes));
+ }
+
+ operator Span<T>() noexcept { return Span<T>(ptr, size()); }
+ operator View<T>() const noexcept { return View<T>(ptr, size()); }
+
+ friend void swap(TensorSpan& lhs, TensorSpan& rhs) noexcept {
+ using std::swap;
+ swap(lhs.ptr, rhs.ptr);
+ swap(lhs.shape, rhs.shape);
+ }
+
+ private:
+ cxx_utils::resizable_static_array<size_type, CSL_MAX_TENSOR_RANK> shape;
+ pointer ptr;
+ };
+
+ /** @brief view of a tensor
+ *
+ * \tparam T type of data stored by the tensor
+ *
+ * A view is valid if and only if the following hold true:
+ * - view is non-empty
+ * - viewed memory is still allocated
+ */
+ template <class T>
+ class TensorView {
+ public:
+ using value_type = typename Tensor<T>::value_type;
+ using pointer = typename Tensor<T>::pointer;
+ using const_pointer = typename Tensor<T>::const_pointer;
+ using size_type = typename Tensor<T>::size_type;
+
+ TensorView() noexcept : ptr{ nullptr } { }
+ TensorView(const TensorView&) noexcept = default;
+ TensorView(TensorSpan<T> other) noexcept : ptr{ other.get() } {
+ const auto rank = other.rank();
+ shape.resize(rank);
+ for (int i = 0; i < rank; i++)
+ shape[i] = other.get_axis_size(i);
+ }
+ TensorView(const Tensor<T>& tensor) noexcept : ptr{ tensor.get() } {
+ const auto rank = tensor.rank();
+ shape.resize(rank);
+ for (int i = 0; i < rank; i++)
+ shape[i] = tensor.get_axis_size(i);
+ }
+
+ template <class ForwardItr>
+ TensorView(pointer ptr_, ForwardItr start, ForwardItr end) : ptr{ ptr_ } {
+ CV_Assert(start != end);
+ CV_Assert(std::distance(start, end) <= CSL_MAX_TENSOR_RANK);
+
+ using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
+ if (std::any_of(start, end, [](ItrValueType x) { return x <= 0; })) {
+ CV_Error(Error::StsBadArg, "the given shape contains negative or zero size");
+ }
+
+ shape.assign(start, end);
+ }
+
+ /** creates a subview of a tensor (or span or view); refer to subview method for more details */
+ template <class... Args>
+        TensorView(TensorView other, size_type offset, Args&&... args)
+ : TensorView(other.subview(offset, std::forward<Args>(args)...)) { }
+
+ TensorView& operator=(const TensorView&) = default;
+ TensorView& operator=(TensorSpan<T> other) noexcept {
+ TensorView tmp(other);
+ swap(*this, tmp);
+ return *this;
+ }
+
+ /** returns true if the view is empty */
+ bool empty() const noexcept { return shape.size() == 0; }
+
+ /** returns the total number of elements in the view
+ *
+ * Pre-conditions:
+ * - view must be non-empty
+ */
+ size_type size() const noexcept {
+ CV_Assert(!empty());
+ return std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<size_type>());
+ }
+
+ /** returns the rank of the view
+ *
+ * Pre-conditions:
+ * - view must be non-empty
+ */
+ size_type rank() const noexcept {
+ CV_Assert(!empty());
+ return shape.size();
+ }
+
+ /** @brief returns the length of the axis
+ *
+ * Every axis is assigned a zero-based index which can be used to select an axis.
+ * Negative index can be used to select an axis from the end.
+ *
+ * Examples:
+ * > -1 represents the last axis
+ * > 0 represents the first axis
+ * > 1 represents the second axis
+ *
+ * Pre-conditions:
+ * - view must be non-empty
+ * - the axis must be in the range [-rank(), rank())
+ */
+ size_type get_axis_size(int axis) const noexcept {
+ axis = clamp_axis(axis, rank());
+ CV_Assert(axis >= 0 && axis < rank());
+ return shape[axis];
+ }
+
+ /** @brief returns the combined size of the axes in an axis range
+ *
+ * if the shape is [3 x 5 x 7 x 11]
+ * - `size_range(0, 2)` will return 3 x 5 = 15
+ * - `size_range(1, 3)` will return 5 x 7 = 35
+ * - `size_range(0, 4)` will return 3 x 5 x 7 x 11 = 1155
+ *
+ * Pre-conditions:
+ * - view must be non-empty
+ * - `axis_start` must be less than or equal to `axis_end`
+ * - `axis_end` must be less than or equal to the rank
+ *
+         * returns one if `axis_start` and `axis_end` are equal
+ */
+ size_type size_range(size_type axis_start, size_type axis_end) const noexcept {
+ CV_Assert(!empty());
+ CV_Assert(axis_start <= axis_end);
+ CV_Assert(axis_end <= rank());
+ auto start = std::begin(shape) + axis_start;
+ auto end = std::begin(shape) + axis_end;
+ return std::accumulate(start, end, 1, std::multiplies<size_type>());
+ }
+
+ /** returns an std::vector containing axis lengths starting from axis zero
+ *
+ * Pre-conditions:
+ * - view must be non-empty
+ *
+ * Exception Guarantee: Strong
+ */
+ std::vector<size_type> shape_as_vector() const {
+ CV_Assert(!empty());
+ return std::vector<size_type>(std::begin(shape), std::end(shape));
+ }
+
+ /** returns a device pointer to immutable device memory */
+ const_pointer get() const noexcept { return ptr; }
+
+ /** @brief reshapes the view
+ *
+ * Length deduction:
+ * The length of at most one axis can be deduced using the total size constraint. The axis can
+ * be marked for deduction by specifying the size as -1.
+ *
+ * The axes for which no size was provided (excluding -1) will be assumed to be one.
+ *
+ * Pre-conditions:
+ * - view must be non-empty
+ * - [start, end) represents a forward range containing length of the axes in order starting from axis zero
+ * - the number of axis lengths must be less than or equal to the tensor rank
+ * - at most one axis length is allowed for length deduction
+         * - the lengths provided must ensure that the total number of elements remains unchanged
+ *
+ * Exception Guarantee: Strong
+ */
+ template <class ForwardItr>
+        typename std::enable_if<cxx_utils::is_forward_iterator<ForwardItr>::value, void>
+ ::type reshape(ForwardItr start, ForwardItr end) {
+ CV_Assert(start != end);
+ CV_Assert(std::distance(start, end) <= rank());
+
+ using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
+
+ /* the user may leave at most one axis size for deduction by specifying -1 */
+ auto sizes_to_deduce = std::count(start, end, -1);
+ if (sizes_to_deduce > 1) { CV_Error(Error::StsBadArg, "only one axis size can be deduced"); }
+
+ /* sizes must be positive numbers with the exception of -1 */
+ auto invalid_sizes = std::count_if(start, end, [](ItrValueType x) {
+ return !(x > 0 || x == -1);
+ });
+ if (invalid_sizes) { CV_Error(Error::StsBadArg, "invalid axis size"); }
+
+ /* compute the total number of elements in the new tensor */
+ size_type unknown_size = 0;
+ auto total = std::accumulate(start, end, 1, std::multiplies<ItrValueType>());
+ if (total < 0) {
+ /* there is an unknown size */
+ if (std::abs(total) <= size()) {
+ unknown_size = size() / std::abs(total);
+ total = size();
+ }
+                /* edge case: if std::abs(total) exceeds size(), deduction is impossible;
+                ** since `total` remains negative, the size check that follows will fail and throw an error
+                */
+ }
+
+ /* the number of elements before and after reshape must be exactly same */
+ if (total != size()) {
+ CV_Error(Error::StsBadArg, "new axes do not preserve the tensor element count");
+ }
+
+ /* we assume the size of the unspecified axes to be one */
+ std::fill(std::begin(shape), std::end(shape), 1);
+ std::copy_backward(start, end, std::end(shape));
+
+ /* replace the unknown axis with the correct value */
+ std::replace(std::begin(shape), std::end(shape), size_type(-1), unknown_size);
+ }
+
+ /** @brief reshapes the view
+ * constructs a range out of the arguments and invokes the range-based reshape method
+ */
+ template <class ...Sizes>
+ void reshape(Sizes... new_sizes_) {
+ static_assert(sizeof...(Sizes) <= CSL_MAX_TENSOR_RANK, "required rank exceeds maximum supported rank");
+ static_assert(sizeof...(Sizes) > 0, "no sizes provided");
+ std::array<std::int64_t, sizeof...(Sizes)> new_sizes = { static_cast<std::int64_t>(new_sizes_)... };
+ reshape(std::begin(new_sizes), std::end(new_sizes));
+ }
+
+ /** @brief reshapes the view
+ *
+ * Pre-conditions:
+ * - the reference tensor/span/view must be non-empty
+ * - the reference tensor/span/view's rank must be less than or equal to the rank of the view
+ *
+ * Exception Guarantee: Strong
+ */
+ template <class TensorType>
+ void reshape_as(const TensorType& tensor) {
+ CV_Assert(!tensor.empty());
+ cxx_utils::resizable_static_array<size_type, CSL_MAX_TENSOR_RANK> new_sizes(tensor.rank());
+ for (int i = 0; i < new_sizes.size(); i++)
+ new_sizes[i] = tensor.get_axis_size(i);
+ reshape(std::begin(new_sizes), std::end(new_sizes));
+ }
+
+ /** @brief squeezes the tensor
+ *
+ * removes all axes of unit size
+ *
+ * Pre-conditions:
+ * - the view must be non-empty
+ * - the view's rank must be at least two
+ *
+ * Exception Guarantee: Strong
+ */
+ void squeeze() {
+ CV_Assert(!empty());
+ CV_Assert(rank() >= 2);
+ auto itr = std::remove(std::begin(shape), std::end(shape), 1);
+ shape.resize(itr - std::begin(shape));
+ }
+
+ /** @brief squeezes the tensor
+ *
+ * removes the specified axis if the axis length is one; otherwise, ignores the request
+ *
+ * Pre-conditions:
+ * - the view must be non-empty
+ * - the view's rank must be at least two
+ *
+ * Exception Guarantee: Strong
+ */
+ void squeeze(int axis) {
+ CV_Assert(!empty());
+ CV_Assert(rank() >= 2);
+ axis = clamp_axis(axis, rank());
+ CV_Assert(axis >= 0 && axis < rank());
+ shape.erase(std::begin(shape) + axis);
+ }
+
+ /** @brief unsqueezes the tensor
+ *
+         * adds an axis of unit size before the specified axis
+ *
+ * Pre-conditions:
+ * - the view must be non-empty
+ * - the view's rank must be less than the maximum supported rank (CSL_MAX_TENSOR_RANK)
+ *
+ * Exception Guarantee: Strong
+ */
+ void unsqueeze(int axis = 0) {
+ CV_Assert(!empty());
+ CV_Assert(rank() < CSL_MAX_TENSOR_RANK);
+ axis = clamp_axis(axis, rank());
+ CV_Assert(axis >= 0 && axis < rank());
+ shape.insert(std::begin(shape) + axis, 1);
+ }
+
+ /** @brief obtains a subview of the view
+ *
+ * The axes for which no size was provided will be assumed to be one.
+ *
+ * Pre-conditions:
+ * - the view must be non-empty
+ * - the `offset` must be less than the size of the view
+ * - [start, end) represents a forward range containing length of the subview axes in order
+ * - the number of axis lengths provided must be less than or equal to the tensor rank
+ * - the lengths provided must ensure that the number of elements does not exceed (old size - offset)
+ *
+ * Exception Guarantee: Strong
+ */
+ template <class ForwardItr>
+ typename std::enable_if<cxx_utils::is_forward_iterator<ForwardItr>::value, TensorView>
+ ::type subview(size_type offset, ForwardItr start, ForwardItr end) const {
+ CV_Assert(start != end);
+ CV_Assert(std::distance(start, end) <= rank());
+
+ auto cur_size = size();
+ CV_Assert(offset < cur_size);
+
+ using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
+
+ /* sizes must be positive numbers */
+ auto invalid_sizes = std::count_if(start, end, [](ItrValueType x) {
+ return !(x > 0);
+ });
+ if (invalid_sizes) { CV_Error(Error::StsBadArg, "invalid axis size"); }
+
+ /* the number of elements must be equal to the new size */
+ auto max_size = (cur_size - offset);
+ auto total = std::accumulate(start, end, 1, std::multiplies<ItrValueType>());
+ if (total > max_size) {
+ CV_Error(Error::StsBadArg, "axes lengths lead to OOB accesses");
+ }
+
+ TensorView temp;
+ temp.shape.assign(start, end);
+ temp.ptr = ptr + offset;
+ return temp;
+ }
+
+ /** @brief obtains a subview of the view
+ * constructs a range out of the size arguments and invokes the range-based subview method
+ */
+ template <class ...Sizes>
+ TensorView subview(size_type offset, Sizes... new_sizes_) const {
+ static_assert(sizeof...(Sizes) <= CSL_MAX_TENSOR_RANK, "required rank exceeds maximum supported rank");
+ static_assert(sizeof...(Sizes) > 0, "no sizes provided");
+ std::array<std::int64_t, sizeof...(Sizes)> new_sizes = { static_cast<std::int64_t>(new_sizes_)... };
+ return subview(offset, std::begin(new_sizes), std::end(new_sizes));
+ }
+
+ operator View<T>() const noexcept { return View<T>(ptr, size()); }
+
+ friend void swap(TensorView& lhs, TensorView& rhs) noexcept {
+ using std::swap;
+ swap(lhs.ptr, rhs.ptr);
+ swap(lhs.shape, rhs.shape);
+ }
+
+ private:
+ cxx_utils::resizable_static_array<size_type, CSL_MAX_TENSOR_RANK> shape;
+ const_pointer ptr;
+ };
+
+ /** returns true if the two TensorType objects have the same shape */
+ template <class TensorType1, class TensorType2>
+ bool is_shape_same(const TensorType1& x, const TensorType2& y) noexcept {
+ auto rank1 = x.rank();
+ auto rank2 = y.rank();
+
+ if (rank1 != rank2)
+ return false;
+
+ for (int i = 0; i < rank1; i++)
+ if (x.get_axis_size(i) != y.get_axis_size(i))
+ return false;
+ return true;
+ }
+
+    /** returns true if the two TensorType objects are broadcast-compatible, i.e. every axis size either matches or is one (e.g. [1 x 3 x 1] and [2 x 3 x 5]) */
+ template <class TensorType1, class TensorType2>
+ bool is_shape_compatible(const TensorType1& x, const TensorType2& y) noexcept {
+ const auto rank1 = x.rank();
+ const auto rank2 = y.rank();
+
+        /* not mathematically required, but technically required by this implementation */
+ if (rank1 != rank2)
+ return false;
+
+ for (int i = 0; i < rank1; i++)
+ if (x.get_axis_size(i) != y.get_axis_size(i) &&
+ x.get_axis_size(i) != 1 && y.get_axis_size(i) != 1)
+ return false;
+ return true;
+ }
+
+    /** returns the effective rank of the tensor, i.e. the rank after ignoring leading unit axes
+     *  (e.g. a tensor of shape [1 x 1 x 3 x 5] has an effective rank of two)
+     */
+ template <class TensorType>
+ std::size_t get_effective_rank(const TensorType& x) noexcept {
+ const auto rank = x.rank();
+ auto effective_rank = rank;
+ for (int i = 0; i < rank; i++, effective_rank--)
+ if (x.get_axis_size(i) != 1)
+ break;
+ return effective_rank;
+ }
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
+
+#include "stream.hpp"
+#include "tensor.hpp"
+#include "pointer.hpp"
+#include "cublas.hpp"
+#include "cudnn.hpp"
+#include "workspace.hpp"
+
+#include "cudnn/convolution.hpp"
+#include "cudnn/pooling.hpp"
+#include "cudnn/lrn.hpp"
+#include "cudnn/softmax.hpp"
+#include "cudnn/transform.hpp"
+#include "cudnn/transpose_convolution.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <array>
+#include <vector>
+#include <algorithm>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+ namespace tensor_ops {
+
+ /** @brief copies data between tensors
+ *
+ * Pre-conditions:
+ * - \p dest and \p src must have the same shape
+ *
+         * Exception Guarantee: Basic
+ */
+ template <class T> inline
+ void copy(const Stream& stream, TensorSpan<T> dest, TensorView<T> src) {
+ CV_Assert(is_shape_same(dest, src));
+ if (dest.get() != src.get())
+ memcpy(dest.get(), src.get(), dest.size(), stream);
+ }
+
+ /** @brief performs generalized matrix-multiplication
+ *
+ * Pre-conditions:
+ * - \p A and \p B must meet the mathematical requirements for matrix multiplication
+ * - \p result must be large enough to hold the result
+ *
+         * Exception Guarantee: Basic
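+         *
+         * Example (illustrative; `handle` is assumed to be a valid cublas::Handle):
+         * @code
+         * // A: [3 x 4], B: [4 x 5], result: [3 x 5]
+         * gemm<float>(handle, 0.f, result, 1.f, false, A, false, B);  // result = AB
+         * @endcode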
+ */
+ template <class T> inline
+ void gemm(const cublas::Handle& handle, T beta, TensorSpan<T> result, T alpha, bool transa, TensorView<T> A, bool transb, TensorView<T> B) {
+ /* matrix operations can be performed only on rank two or less tensors */
+ CV_Assert(get_effective_rank(A) <= 2 &&
+ get_effective_rank(B) <= 2 &&
+ get_effective_rank(result) <= 2);
+
+ /* check dimension requirements for matrix multiplication */
+ if (!transa && !transb) {
+ CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
+ CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-2));
+ CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
+ } else if (!transa && transb) {
+ CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
+ CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-1));
+ CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
+ } else if (transa && !transb) {
+ CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
+ CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-2));
+ CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
+ } else {
+ CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
+ CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-1));
+ CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
+ }
+
+ const auto result_nr = result.get_axis_size(-2);
+ const auto result_nc = result.get_axis_size(-1);
+ const auto common_dim = A.get_axis_size(transa ? -2 : -1);
+ const auto A_nc = A.get_axis_size(-1);
+ const auto B_nc = B.get_axis_size(-1);
+
+ /* tensors are stored in row-major but cublas::gemm operates on column-major matrices
+ * a row-major matrix when read as column-major matrix gives the transpose of the intended matrix
+ *
+ * Required: C = AB
+ * what cuBLAS sees: C^T = A^TB^T = (BA)^T
+ *
+ * By reversing operands, we effectively perform:
+ * C^T = B^TA^T = (AB)^T
+ *
+ * which gives C = AB
+ */
+ cublas::gemm<T>(handle,
+ transb, transa,
+ result_nc, result_nr, common_dim,
+ alpha, B.get(), B_nc,
+ A.get(), A_nc,
+ beta, result.get(), result_nc);
+ }
+
+        /** @brief performs channel-wise softmax (or log softmax if \p log is true)
+         *
+         * Pre-conditions:
+         * - \p output and \p input must have the same shape
+         *
+         * Exception Guarantee: Basic
+         */
+ template <class T> inline
+ void softmax(const cudnn::Handle& handle, TensorSpan<T> output, TensorView<T> input, int channel_axis, bool log) {
+ CV_Assert(is_shape_same(output, input));
+
+ channel_axis = clamp_axis(channel_axis, input.rank());
+
+ std::size_t outer_size = input.size_range(0, channel_axis);
+ auto channel_size = input.get_axis_size(channel_axis);
+ std::size_t inner_size = input.size_range(channel_axis + 1, input.rank());
+
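+            /* cuDNN's softmax expects a 4D tensor; the shape is hence folded into
+             * [outer_size x channel_size x 1 x inner_size], e.g. an input of shape
+             * [2 x 3 x 4 x 5] with channel_axis = 1 is presented to cuDNN as [2 x 3 x 1 x 20]
+             */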
+ std::array<std::size_t, 4> shape = { outer_size, channel_size, 1, inner_size };
+
+ using cudnn::TensorDescriptor;
+ auto inputDesc = TensorDescriptor<T>(shape);
+ auto outputDesc = TensorDescriptor<T>(shape);
+ cudnn::softmax(handle, outputDesc, output.get(), inputDesc, input.get(), log);
+ }
+ }
+
+ template <class T>
+ class Convolution {
+ using TensorDescriptor = cudnn::TensorDescriptor<T>;
+ using FilterDescriptor = cudnn::FilterDescriptor<T>;
+ using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
+ using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm<T>;
+
+ public:
+ struct params_type {
+ std::vector<std::size_t> input_shape;
+ std::vector<std::size_t> filter_shape;
+
+ std::vector<std::size_t> padding;
+ std::vector<std::size_t> stride;
+ std::vector<std::size_t> dilation;
+
+ std::size_t groups;
+ };
+
+ Convolution() = default;
+ Convolution(const Convolution&) = delete;
+ Convolution(Convolution&&) = default;
+ Convolution(cudnn::Handle handle, const params_type& params) {
+ cudnnHandle = std::move(handle);
+
+ inputTensorDesc = TensorDescriptor(params.input_shape);
+ filterDesc = FilterDescriptor(params.filter_shape);
+ convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
+
+ std::vector<int> output_dims;
+ getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims);
+ outputTensorDesc = TensorDescriptor(output_dims);
+
+ algo = ConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, inputTensorDesc, outputTensorDesc);
+ }
+
+ Convolution& operator=(const Convolution&) = delete;
+ Convolution& operator=(Convolution&&) = default;
+
+ std::size_t get_workspace_size() const noexcept {
+ return algo.get_workspace_size();
+ }
+
+ void convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
+ cudnn::convolve<T>(
+ cudnnHandle,
+ convDesc, algo, scratchpad,
+ filterDesc, filters.get(),
+ inputTensorDesc, input.get(),
+ 1.0, 0.0, outputTensorDesc, output.get()
+ );
+ }
+
+ private:
+ cudnn::Handle cudnnHandle;
+ TensorDescriptor inputTensorDesc, outputTensorDesc;
+ FilterDescriptor filterDesc;
+ ConvolutionDescriptor convDesc;
+ ConvolutionAlgorithm algo;
+ };
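+
+    /* Example (illustrative sketch; shapes are hypothetical NCHW values and
+     * `handle` is assumed to be a valid cudnn::Handle):
+     *
+     *   Convolution<float>::params_type params;
+     *   params.input_shape  = { 1, 3, 224, 224 };  // N x C x H x W
+     *   params.filter_shape = { 64, 3, 7, 7 };     // K x C x Kh x Kw
+     *   params.padding  = { 3, 3 };
+     *   params.stride   = { 2, 2 };
+     *   params.dilation = { 1, 1 };
+     *   params.groups   = 1;
+     *
+     *   Convolution<float> conv(handle, params);
+     *   // convolve() additionally needs a scratchpad of at least conv.get_workspace_size() bytes
+     */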
+
+ template <class T>
+ class TransposeConvolution {
+ using TensorDescriptor = cudnn::TensorDescriptor<T>;
+ using FilterDescriptor = cudnn::FilterDescriptor<T>;
+ using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
+ using TransposeConvolutionAlgorithm = cudnn::TransposeConvolutionAlgorithm<T>;
+
+ public:
+ struct params_type {
+ std::vector<std::size_t> input_shape;
+ std::vector<std::size_t> output_shape;
+
+ std::vector<std::size_t> filter_shape;
+
+ std::vector<std::size_t> padding;
+ std::vector<std::size_t> stride;
+ std::vector<std::size_t> dilation;
+
+ std::size_t groups;
+ };
+
+ TransposeConvolution() = default;
+ TransposeConvolution(const TransposeConvolution&) = delete;
+ TransposeConvolution(TransposeConvolution&&) = default;
+ TransposeConvolution(cudnn::Handle handle, const params_type& params) {
+ cudnnHandle = std::move(handle);
+
+ filterDesc = FilterDescriptor(params.filter_shape);
+ convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
+
+ /* input_shape is the output shape for convolution
+ * output_shape is the input shape for convolution
+ */
+ convInputTensorDesc = TensorDescriptor(params.output_shape);
+
+ std::vector<int> conv_output_dims;
+ getConvolutionForwardOutputDim(convDesc, filterDesc, convInputTensorDesc, conv_output_dims);
+
+ /* the convolution output must be identical to what cuDNN expects */
+ CV_Assert(std::equal(std::begin(conv_output_dims), std::end(conv_output_dims), std::begin(params.input_shape)));
+
+ convOutputTensorDesc = TensorDescriptor(params.input_shape);
+
+ algo = TransposeConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, convOutputTensorDesc, convInputTensorDesc);
+ }
+
+ TransposeConvolution& operator=(const TransposeConvolution&) = delete;
+ TransposeConvolution& operator=(TransposeConvolution&&) = default;
+
+ std::size_t get_workspace_size() const noexcept {
+ return algo.get_workspace_size();
+ }
+
+ void transpose_convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
+ cudnn::transpose_convolve<T>(
+ cudnnHandle,
+ convDesc, algo, scratchpad,
+ filterDesc, filters.get(),
+ convOutputTensorDesc, input.get(),
+ 1.0, 0.0, convInputTensorDesc, output.get()
+ );
+ }
+
+ private:
+ cudnn::Handle cudnnHandle;
+ TensorDescriptor convInputTensorDesc, convOutputTensorDesc;
+ FilterDescriptor filterDesc;
+ ConvolutionDescriptor convDesc;
+ TransposeConvolutionAlgorithm algo;
+ };
+
+ template <class T>
+ class Pooling {
+ using TensorDescriptor = cudnn::TensorDescriptor<T>;
+ using PoolingDescriptor = cudnn::PoolingDescriptor;
+
+ public:
+ using PoolingType = PoolingDescriptor::PoolingType;
+
+ struct params_type {
+ std::vector<std::size_t> input_shape;
+ std::vector<std::size_t> output_shape;
+
+ std::vector<std::size_t> window_size;
+ std::vector<std::size_t> padding;
+ std::vector<std::size_t> stride;
+
+ PoolingType type;
+ };
+
+ Pooling() = default;
+ Pooling(const Pooling&) = delete;
+ Pooling(Pooling&&) = default;
+ Pooling(cudnn::Handle handle, const params_type& params) {
+ cudnnHandle = std::move(handle);
+
+ inputTensorDesc = TensorDescriptor(params.input_shape);
+ poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type);
+
+            /* the output shape is taken from params instead of being computed with getPoolingForwardOutputDim */
+ outputTensorDesc = TensorDescriptor(params.output_shape);
+ }
+
+ Pooling& operator=(const Pooling&) = delete;
+ Pooling& operator=(Pooling&&) = default;
+
+ void pool(TensorView<T> input, TensorSpan<T> output) {
+ cudnn::pool<T>(
+ cudnnHandle,
+ poolingDesc,
+ inputTensorDesc, input.get(),
+ 1.0, 0.0, outputTensorDesc, output.get()
+ );
+ }
+
+ private:
+ cudnn::Handle cudnnHandle;
+ TensorDescriptor inputTensorDesc, outputTensorDesc;
+ PoolingDescriptor poolingDesc;
+ };
+
+ template <class T>
+ class LRN {
+ using LRNDescriptor = cudnn::LRNDescriptor;
+ using TensorDescriptor = cudnn::TensorDescriptor<T>;
+
+ public:
+ using LRNType = LRNDescriptor::LRNType;
+
+ LRN() = default;
+ LRN(const LRN&) = delete;
+ LRN(LRN&&) = default;
+ LRN(cudnn::Handle handle, std::size_t local_size, T alpha, T beta, T k, LRNType type) {
+ cudnnHandle = std::move(handle);
+ lrnDesc = LRNDescriptor(local_size, alpha, beta, k, type);
+ }
+
+ LRN& operator=(const LRN&) = delete;
+ LRN& operator=(LRN&&) = default;
+
+ void normalize(TensorView<T> input, TensorSpan<T> output, WorkspaceInstance workspace) {
+ cudnn::LRNForward<T>(
+ cudnnHandle,
+ lrnDesc,
+ TensorDescriptor(input.shape_as_vector()), input.get(),
+ 1.0, 0.0, TensorDescriptor(output.shape_as_vector()), output.get(),
+ workspace
+ );
+ }
+
+ private:
+ cudnn::Handle cudnnHandle;
+ LRNDescriptor lrnDesc;
+ };
+
+ template <class T>
+ class TensorTransform {
+ using TensorTransformDescriptor = cudnn::TensorTransformDescriptor;
+ using TensorDescriptor = cudnn::TensorDescriptor<T>;
+
+ public:
+ TensorTransform() = default;
+ TensorTransform(const TensorTransform&) = delete;
+ TensorTransform(TensorTransform&&) = default;
+
+ template <class SequenceContainer>
+ TensorTransform(cudnn::Handle handle, const SequenceContainer& paddingLeft, const SequenceContainer& paddingRight) {
+ cudnnHandle = std::move(handle);
+ transDesc = TensorTransformDescriptor(paddingLeft, paddingRight);
+ }
+
+ TensorTransform& operator=(const TensorTransform&) = delete;
+ TensorTransform& operator=(TensorTransform&&) = default;
+
+ void transform(TensorView<T> input, TensorSpan<T> output) {
+ cudnn::transform<T>(
+ cudnnHandle,
+ transDesc,
+ TensorDescriptor(input.shape_as_vector()), input.get(),
+ TensorDescriptor(output.shape_as_vector()), output.get()
+ );
+ }
+
+ private:
+ cudnn::Handle cudnnHandle;
+ TensorTransformDescriptor transDesc;
+ };
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP
+
+#include "pointer.hpp"
+#include "span.hpp"
+#include "tensor.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <numeric>
+#include <functional>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+ /** @brief maintains a single block of reusable device memory
+ *
+ * Each Workspace object is intended to be used by a single entity at a time but by
+ * different entities at different times. It maintains a single reusable block of memory which
+ * is sufficient for the largest consumer.
+ */
+ class Workspace {
+ public:
+
+ /** @brief reserve \p bytes of memory */
+ void require(std::size_t bytes) {
+ if (bytes > ptr.size())
+ ptr.reset(bytes);
+ }
+
+ /** @brief number of bytes reserved by the largest consumer */
+ std::size_t size() const noexcept {
+ return ptr.size();
+ }
+
+ /** @brief returns the pointer to the workspace memory */
+ DevicePtr<unsigned char> get() {
+ return ptr.get();
+ }
+
+ private:
+ ManagedPtr<unsigned char> ptr;
+ };
+
+ /** used to compute total workspace size from several workspace requests */
+ class WorkspaceBuilder {
+ public:
+ WorkspaceBuilder() noexcept : max_size_in_bytes{ 0 } { }
+
+        /** requests memory for \p count elements of type \tparam T */
+ template <class T = std::int8_t>
+ void require(std::size_t count) noexcept {
+ auto blocks256 = (count * sizeof(T) + 255) / 256;
+ max_size_in_bytes += blocks256 * 256;
+ }
+
+ /** returns the total workspace memory that is required */
+ std::size_t required_workspace_size() const noexcept { return max_size_in_bytes; }
+
+ private:
+ std::size_t max_size_in_bytes;
+ };
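+
+    /* Example (illustrative):
+     *
+     *   WorkspaceBuilder builder;
+     *   builder.require<float>(1024);  // each request is rounded up to 256-byte blocks
+     *   builder.require<float>(512);
+     *   std::size_t total_bytes = builder.required_workspace_size();
+     */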
+
+ /** general memory block from a workspace which can be passed on to the requester */
+ class WorkspaceInstance {
+ public:
+
+ /** returns a device pointer to the workspace memory */
+ template <class T = void>
+ DevicePtr<T> get() const noexcept {
+ return static_cast<DevicePtr<T>>(ptr);
+ }
+
+        /** returns the size of the workspace memory in bytes */
+ std::size_t size_in_bytes() const noexcept {
+ return size_in_bytes_;
+ }
+
+ /** creates a Span<T> of \p count elements from the workspace memory */
+ template <class T>
+ Span<T> get_span(std::size_t count = 0) const {
+ if (count == 0)
+ count = size_in_bytes_ / sizeof(T);
+
+ if (count * sizeof(T) > size_in_bytes_)
+ CV_Error(Error::StsNoMem, "memory not sufficient");
+
+ return Span<T>(static_cast<DevicePtr<T>>(ptr), count);
+ }
+
+ /** creates a TensorSpan<T> of the given shape from the workspace memory */
+ template <class T, class ForwardItr>
+ TensorSpan<T> get_tensor_span(ForwardItr shape_begin, ForwardItr shape_end) const {
+ using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
+ auto required_size = std::accumulate(shape_begin, shape_end, 1, std::multiplies<ItrValueType>());
+ if (required_size * sizeof(T) > size_in_bytes_)
+ CV_Error(Error::StsNoMem, "memory not sufficient");
+ return TensorSpan<T>(static_cast<DevicePtr<T>>(ptr), shape_begin, shape_end);
+ }
+
+ private:
+ DevicePtr<void> ptr;
+ std::size_t size_in_bytes_;
+
+ friend class WorkspaceAllocator;
+ WorkspaceInstance(DevicePtr<void> ptr_, std::size_t size_in_bytes__)
+ : ptr{ ptr_ }, size_in_bytes_{ size_in_bytes__ } { }
+ };
+
+ /** used to split a single workspace into constituents */
+ class WorkspaceAllocator {
+ public:
+ WorkspaceAllocator() = default;
+ WorkspaceAllocator(Workspace& workspace) noexcept
+ : current{ workspace.get() }, bytes_remaining { workspace.size() }
+ {
+ CV_Assert(is_aligned<void>(current, 256));
+ CV_Assert(bytes_remaining % 256 == 0);
+ }
+
+ /** allocates a Span<T> of \p count elements from the workspace memory */
+ template <class T>
+ Span<T> get_span(std::size_t count = 0) {
+            return acquire<T>(count);
+ }
+
+ /** allocates a TensorSpan<T> of the given shape from the workspace memory */
+ template <class T, class ForwardItr>
+ TensorSpan<T> get_tensor_span(ForwardItr start, ForwardItr end) {
+ using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
+ auto required_size = std::accumulate(start, end, 1, std::multiplies<ItrValueType>());
+            return TensorSpan<T>(acquire<T>(required_size).data(), start, end);
+ }
+
+ /** allocates a WorkspaceInstance of size \p bytes from the workspace memory */
+ WorkspaceInstance get_instance(std::size_t bytes = 0) {
+            auto span = acquire(bytes);
+ return WorkspaceInstance(DevicePtr<void>(span.data()), span.size());
+ }
+
+ private:
+ template <class T = std::int8_t>
+        Span<T> acquire(std::size_t count = 0) {
+ auto ptr = current;
+
+ if (count == 0)
+ count = bytes_remaining / sizeof(T);
+
+ auto blocks256 = (count * sizeof(T) + 255) / 256;
+ if (bytes_remaining < blocks256 * 256)
+ CV_Error(Error::StsNoMem, "out of workspace memory");
+
+ bytes_remaining -= blocks256 * 256;
+ current = static_cast<DevicePtr<std::int8_t>>(current) + blocks256 * 256;
+ return Span<T>(static_cast<DevicePtr<T>>(ptr), count);
+ }
+
+ DevicePtr<void> current;
+ std::size_t bytes_remaining;
+ };
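+
+    /* Example (illustrative sketch of the intended usage pattern; `workspace` is
+     * assumed to be a Workspace sized beforehand using a WorkspaceBuilder):
+     *
+     *   WorkspaceAllocator allocator(workspace);
+     *   auto scratch = allocator.get_span<float>(1024);  // 256-byte aligned chunk
+     *   auto instance = allocator.get_instance(4096);    // e.g. for a cuDNN scratchpad
+     */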
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP
+
+#include <iterator>
+#include <type_traits>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace cxx_utils {
+
+ namespace detail {
+ template <class T, class Tag, class = void>
+ struct is_iterator_helper : std::false_type {};
+
+ template <class T, class Tag>
+ struct is_iterator_helper<T, Tag,
+ typename std::enable_if<std::is_base_of<Tag, typename std::iterator_traits<T>::iterator_category>::value, void>::type
+ > : std::true_type {};
+ }
+
+ template <class T>
+ using is_iterator = typename detail::is_iterator_helper<T, std::input_iterator_tag>;
+
+ template <class T>
+ using is_forward_iterator = typename detail::is_iterator_helper<T, std::forward_iterator_tag>;
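+
+    /* e.g. (illustrative) is_forward_iterator<int*>::value is true, while
+     * is_forward_iterator<std::istream_iterator<int>>::value is false since
+     * std::input_iterator_tag is not derived from std::forward_iterator_tag
+     */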
+
+}}}} /* namespace cv::dnn::cuda4dnn::cxx_utils */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP
+
+#include <cstddef>
+#include <array>
+#include <cassert>
+#include <algorithm>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace cxx_utils {
+
+ template <class T, std::size_t maxN>
+ class resizable_static_array {
+ using container_type = std::array<T, maxN>;
+
+ public:
+ using value_type = typename container_type::value_type;
+ using size_type = typename container_type::size_type;
+ using difference_type = typename container_type::difference_type;
+ using reference = typename container_type::reference;
+ using const_reference = typename container_type::const_reference;
+ using pointer = typename container_type::pointer;
+ using const_pointer = typename container_type::const_pointer;
+ using iterator = typename container_type::iterator;
+ using const_iterator = typename container_type::const_iterator;
+ using reverse_iterator = typename container_type::reverse_iterator;
+ using const_reverse_iterator = typename container_type::const_reverse_iterator;
+
+ resizable_static_array() noexcept : size_{ 0 } { }
+ explicit resizable_static_array(size_type sz) noexcept : size_{ sz } { }
+
+        bool empty() const noexcept { return size_ == 0; }
+ size_type size() const noexcept { return size_; }
+ size_type capacity() const noexcept { return maxN; }
+
+ void resize(size_type sz) noexcept {
+ assert(sz <= capacity());
+ size_ = sz;
+ }
+
+ void clear() noexcept { size_ = 0; }
+
+ template <class ForwardItr>
+ void assign(ForwardItr first, ForwardItr last) {
+ resize(std::distance(first, last));
+ std::copy(first, last, begin());
+ }
+
+ iterator begin() noexcept { return std::begin(arr); }
+ iterator end() noexcept { return std::begin(arr) + size(); }
+
+ const_iterator begin() const noexcept { return arr.cbegin(); }
+ const_iterator end() const noexcept { return arr.cbegin() + size(); }
+
+ const_iterator cbegin() const noexcept { return arr.cbegin(); }
+ const_iterator cend() const noexcept { return arr.cbegin() + size(); }
+
+        reverse_iterator rbegin() noexcept { return reverse_iterator(end()); }
+        reverse_iterator rend() noexcept { return reverse_iterator(begin()); }
+
+        const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(cend()); }
+        const_reverse_iterator rend() const noexcept { return const_reverse_iterator(cbegin()); }
+
+        const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(cend()); }
+        const_reverse_iterator crend() const noexcept { return const_reverse_iterator(cbegin()); }
+
+ reference operator[](size_type pos) {
+ assert(pos < size());
+ return arr[pos];
+ }
+
+ const_reference operator[](size_type pos) const {
+ assert(pos < size());
+ return arr[pos];
+ }
+
+ iterator insert(iterator pos, const T& value) {
+ resize(size() + 1);
+ std::move_backward(pos, end() - 1, end());
+ *pos = value;
+ return pos;
+ }
+
+ iterator insert(iterator pos, T&& value) {
+ resize(size() + 1);
+ std::move_backward(pos, end() - 1, end());
+ *pos = std::move(value);
+ return pos;
+ }
+
+ iterator erase(iterator pos) {
+ std::move(pos + 1, end(), pos);
+ resize(size() - 1);
+ return pos;
+ }
+
+ pointer data() noexcept { return arr.data(); }
+ const_pointer data() const noexcept { return arr.data(); }
+
+ private:
+ std::size_t size_;
+ container_type arr;
+ };
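+
+    /* Example (illustrative):
+     *
+     *   resizable_static_array<int, 4> arr(2);  // size 2, capacity 4
+     *   arr[0] = 10; arr[1] = 20;
+     *   arr.insert(arr.begin(), 5);             // contents: 5, 10, 20
+     */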
+
+}}}} /* namespace cv::dnn::cuda4dnn::cxx_utils */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP */
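The container above behaves like a trimmed-down `std::vector` backed by fixed inline storage, so shape-like data never touches the heap. A minimal usage sketch (standalone; the include path is an assumption and depends on where the header sits in the build tree):

```cpp
#include <cassert>
// #include "resizable_static_array.hpp"  /* path depends on the build tree */

using cv::dnn::cuda4dnn::cxx_utils::resizable_static_array;

int main() {
    resizable_static_array<int, 8> shape;    // capacity fixed at 8, size starts at 0
    assert(shape.empty());

    int dims[] = { 1, 3, 224, 224 };
    shape.assign(dims, dims + 4);            // size becomes 4; no allocation happens
    assert(shape.size() == 4 && shape.capacity() == 8);

    shape.insert(shape.begin() + 1, 16);     // shifts elements right; size becomes 5
    shape.erase(shape.begin());              // shifts elements left; size becomes 4
    assert(shape[0] == 16);
    return 0;
}
```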
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ template <class T>
+ void abs(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
+ template <class T>
+ void tanh(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
+ template <class T>
+ void sigmoid(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
+ template <class T>
+ void bnll(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
+ template <class T>
+ void elu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
+ template <class T>
+ void relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T slope);
+
+ template <class T>
+ void clipped_relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T floor, T ceiling);
+
+ template <class T>
+ void axiswise_relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t inner_size, csl::View<T> slope);
+
+ template <class T>
+ void power(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T exp, T scale, T shift);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP */
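For reference, these are the elementwise formulas the declarations above are expected to compute, written as a scalar sketch in plain C++ (the actual kernels operate on device spans and live in the matching `.cu` file; BNLL, ELU with alpha = 1, and Power follow Caffe's conventions, which is an assumption here):

```cpp
#include <cmath>
#include <algorithm>

// Scalar reference semantics for the kernel declarations above (float case).
float abs_ref(float x)                    { return std::fabs(x); }
float tanh_ref(float x)                   { return std::tanh(x); }
float sigmoid_ref(float x)                { return 1.0f / (1.0f + std::exp(-x)); }
float elu_ref(float x)                    { return x >= 0 ? x : std::expm1(x); }
// numerically stable log(1 + exp(x))
float bnll_ref(float x)                   { return std::max(x, 0.0f) + std::log1p(std::exp(-std::fabs(x))); }
// slope != 0 gives leaky ReLU; slope == 0 is the ordinary ReLU
float relu_ref(float x, float slope)      { return x >= 0 ? x : slope * x; }
float clipped_relu_ref(float x, float floor, float ceiling) { return std::min(std::max(x, floor), ceiling); }
float power_ref(float x, float exp, float scale, float shift) { return std::pow(scale * x + shift, exp); }
```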
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include <cstddef>
+#include <vector>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ template <class T>
+ void concat(
+ const csl::Stream& stream,
+ csl::TensorSpan<T> output, std::size_t output_axis_offset,
+ csl::TensorView<T> input, std::size_t axis);
+
+ template <class T>
+ void concat_with_offsets(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, std::vector<std::size_t> axis_offsets);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP */
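The first overload copies `input` into `output` starting at `output_axis_offset` along `axis`; `concat_with_offsets` generalizes this to an arbitrary offset per axis. A host-side sketch of the indexing the first overload implies, assuming row-major shapes (an illustration, not the CUDA kernel):

```cpp
#include <cstddef>
#include <vector>

// CPU reference for axis-concat: place `input` inside `output` along `axis`
// at `output_axis_offset`. All other axes of the two shapes must match.
static void concat_reference(float* output, const std::vector<std::size_t>& out_shape,
                             std::size_t output_axis_offset,
                             const float* input, const std::vector<std::size_t>& in_shape,
                             std::size_t axis)
{
    std::size_t outer = 1, inner = 1;
    for (std::size_t i = 0; i < axis; i++) outer *= in_shape[i];
    for (std::size_t i = axis + 1; i < in_shape.size(); i++) inner *= in_shape[i];

    const std::size_t in_axis = in_shape[axis], out_axis = out_shape[axis];
    for (std::size_t o = 0; o < outer; o++)
        for (std::size_t a = 0; a < in_axis; a++)
            for (std::size_t i = 0; i < inner; i++)
                output[(o * out_axis + output_axis_offset + a) * inner + i] =
                    input[(o * in_axis + a) * inner + i];
}
```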
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ template <class T>
+ void eltwise_max_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);
+
+ template <class T>
+ void eltwise_sum_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);
+
+ template <class T>
+ void eltwise_sum_coeff_2(const csl::Stream& stream, csl::Span<T> output, T coeff_x, csl::View<T> x, T coeff_y, csl::View<T> y);
+
+ template <class T>
+ void eltwise_prod_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ template <class T>
+ void fill(const csl::Stream& stream, csl::Span<T> output, T value);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include <cstddef>
+#include <vector>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ template <class T>
+ void max_pooling_with_indices(
+ const csl::Stream& stream,
+ csl::TensorSpan<T> output, csl::TensorSpan<T> indices, csl::TensorView<T> input,
+ const std::vector<std::size_t>& kernel_size, const std::vector<std::size_t>& strides,
+ const std::vector<std::size_t>& padding_left);
+
+ template <class T>
+ void max_unpooling(
+ const csl::Stream& stream,
+ csl::TensorSpan<T> output, csl::TensorView<T> input, csl::TensorView<T> indices,
+ const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
+ const std::vector<std::size_t>& padding_left);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ template <class T>
+ void normalize(
+ const csl::Stream& stream,
+ csl::Span<T> output, csl::View<T> input,
+ std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, std::size_t norm, T epsilon,
+ csl::Span<T> workspace);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
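+ /* `ranges[i]` is the span in the output which the input occupies along axis i;
+ * locations outside the span are filled by reflecting about the span ends without
+ * repeating the border element (cv::BORDER_REFLECT_101 semantics)
+ */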
+ template <class T>
+ void copy_with_reflection101(
+ const csl::Stream& stream,
+ csl::TensorSpan<T> output, csl::TensorView<T> input,
+ std::vector<std::pair<std::size_t, std::size_t>> ranges);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include <cstddef>
+#include <vector>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ template <class T>
+ void permute(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, std::vector<std::size_t> order);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+#include <cstddef>
+#include <vector>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ template <class T>
+ void generate_prior_boxes(
+ const csl::Stream& stream,
+ csl::Span<T> output,
+ csl::View<float> boxWidth, csl::View<float> boxHeight, csl::View<float> offsetX, csl::View<float> offsetY, float stepX, float stepY,
+ std::vector<float> variance,
+ std::size_t numPriors,
+ std::size_t layerWidth, std::size_t layerHeight,
+ std::size_t imageWidth, std::size_t imageHeight,
+ bool normalize, bool clip);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ template <class T>
+ void sigmoid_strided(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t n, std::size_t stride, std::size_t offset);
+
+ template <class T>
+ void softmax_strided(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t n, std::size_t stride, std::size_t offset);
+
+ template <class T>
+ void region_finalize(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, csl::View<T> bias,
+ T object_prob_cutoff, T class_prob_cutoff,
+ std::size_t height_norm, std::size_t width_norm,
+ std::size_t rows, std::size_t cols,
+ std::size_t boxes_per_cell,
+ std::size_t box_size,
+ std::size_t classes);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ template <class T>
+ void resize_nn(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input);
+
+ template <class T>
+ void resize_bilinear(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, float scale_y, float scale_x);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_SHIFT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_SHIFT_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ template <class T>
+ void bias1(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, T alpha);
+
+ template <class T>
+ void biasN(const csl::Stream& stream,
+ csl::TensorSpan<T> output,
+ csl::TensorView<T> input, std::size_t inner_size,
+ csl::TensorView<T> bias);
+
+ template <class T>
+ void scale1(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, T alpha);
+
+ template <class T>
+ void scaleN(const csl::Stream& stream,
+ csl::TensorSpan<T> output,
+ csl::TensorView<T> input, std::size_t inner_size,
+ csl::TensorView<T> weights);
+
+ template <class T>
+ void scale1_with_bias1(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha, T beta);
+
+ template <class T>
+ void scaleN_with_biasN(
+ const csl::Stream& stream,
+ csl::TensorSpan<T> output,
+ csl::TensorView<T> input, std::size_t inner_size,
+ csl::TensorView<T> weights, csl::TensorView<T> bias);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_SHIFT_HPP */
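The `inner_size` parameter of the `N` variants encodes a broadcasting convention: the weight/bias index advances once per `inner_size` consecutive elements and wraps after `N` entries, which is how the primitives below obtain per-channel scaling by passing `inner_size = H * W * ...`. A host-side sketch of the assumed indexing (not the CUDA kernel):

```cpp
#include <cstddef>
#include <vector>

// Reference semantics for scaleN: output[i] = input[i] * weights[(i / inner_size) % N].
// biasN is analogous with addition in place of multiplication.
static void scaleN_reference(std::vector<float>& output, const std::vector<float>& input,
                             std::size_t inner_size, const std::vector<float>& weights)
{
    for (std::size_t i = 0; i < input.size(); i++)
        output[i] = input[i] * weights[(i / inner_size) % weights.size()];
}
```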
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include <cstddef>
+#include <vector>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+ template <class T>
+ void slice(const csl::Stream& stream,
+ csl::TensorSpan<T> output, csl::TensorView<T> input,
+ std::vector<std::size_t> offsets);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include "../kernels/activations.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ template <class T>
+ class ReLUOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ ReLUOp(csl::Stream stream_, T slope_)
+ : stream(std::move(stream_)), slope{ slope_ } { }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ kernels::relu<T>(stream, output, input, slope);
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ const T slope;
+ };
+
+ template <class T>
+ class ClippedReLUOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ ClippedReLUOp(csl::Stream stream_, T min_, T max_)
+ : stream(std::move(stream_)), min{ min_ }, max{ max_ } { }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ kernels::clipped_relu<T>(stream, output, input, min, max);
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ const T min, max;
+ };
+
+ template <class T>
+ class ChannelwiseReLUOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ ChannelwiseReLUOp(csl::Stream stream_, const Mat& slope)
+ : stream(std::move(stream_))
+ {
+ CV_Assert(!slope.empty());
+ slopeTensor = csl::makeTensorHeader<T>(slope);
+ csl::copyMatToTensor<T>(slope, slopeTensor, stream);
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ CV_Assert(input.get_axis_size(1) == slopeTensor.size());
+ std::size_t inner_size = input.size_range(2, input.rank());
+ kernels::axiswise_relu<T>(stream, output, input, inner_size, slopeTensor);
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ csl::Tensor<T> slopeTensor;
+ };
+
+ template <class T>
+ class TanHOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ TanHOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ kernels::tanh<T>(stream, output, input);
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ };
+
+ template <class T>
+ class SigmoidOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ SigmoidOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ kernels::sigmoid<T>(stream, output, input);
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ };
+
+ template <class T>
+ class ELUOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ ELUOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ kernels::elu<T>(stream, output, input);
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ };
+
+ template <class T>
+ class AbsValOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ AbsValOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ kernels::abs<T>(stream, output, input);
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ };
+
+ template <class T>
+ class BNLLOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ BNLLOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ kernels::bnll<T>(stream, output, input);
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ };
+
+ template <class T>
+ class PowerOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ PowerOp(csl::Stream stream_, T exp_, T scale_, T shift_)
+ : stream(std::move(stream_)), exp{ exp_ }, scale{ scale_ }, shift{ shift_ } { }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ kernels::power<T>(stream, output, input, exp, scale, shift);
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ const T exp, scale, shift;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include "../kernels/scale_shift.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ template <class T>
+ class BatchNormOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ BatchNormOp(csl::Stream stream_, const cv::Mat& weights, const cv::Mat& bias)
+ : stream(std::move(stream_))
+ {
+ biasTensor = csl::makeTensorHeader<T>(bias);
+ csl::copyMatToTensor<T>(bias, biasTensor, stream);
+
+ weightsTensor = csl::makeTensorHeader<T>(weights);
+ csl::copyMatToTensor<T>(weights, weightsTensor, stream);
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ std::size_t inner_size = input.size_range(2, input.rank());
+ kernels::scaleN_with_biasN<T>(stream, output, input, inner_size, weightsTensor, biasTensor);
+ }
+
+ private:
+ csl::Stream stream;
+ csl::Tensor<T> weightsTensor, biasTensor;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP */
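At inference time batch normalization collapses into the per-channel affine transform applied above, so the `weights` and `bias` passed to the constructor are expected to already contain the folded statistics. A sketch of the usual folding (an illustration under that assumption; the real values come from the layer that constructs this node, and `foldBatchNorm` is a hypothetical helper):

```cpp
#include <opencv2/core.hpp>
#include <cmath>

// Hypothetical helper: fold BN statistics (mean, variance, gamma, beta, epsilon)
// into the per-channel scale/shift consumed by BatchNormOp, i.e. y = scale * x + shift.
static void foldBatchNorm(const cv::Mat& mean, const cv::Mat& variance,
                          const cv::Mat& gamma, const cv::Mat& beta, float eps,
                          cv::Mat& weights, cv::Mat& bias)
{
    weights.create(gamma.size(), CV_32F);
    bias.create(beta.size(), CV_32F);
    for (int c = 0; c < (int)gamma.total(); c++)
    {
        const float scale = gamma.at<float>(c) / std::sqrt(variance.at<float>(c) + eps);
        weights.at<float>(c) = scale;
        bias.at<float>(c) = beta.at<float>(c) - scale * mean.at<float>(c);
    }
}
```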
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/pointer.hpp"
+
+#include "../kernels/fill.hpp"
+#include "../kernels/concat.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ template <class T>
+ class ConcatOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ ConcatOp(csl::Stream stream_, std::size_t concat_axis, bool zero_padding)
+ : stream(std::move(stream_)), concat_axis{ concat_axis }, zero_padding{ zero_padding }
+ {
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(outputs.size() == 1);
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ if(zero_padding)
+ {
+ auto output_shape = output_wrapper->getShape();
+
+ kernels::fill<T>(stream, output, 0.0);
+
+ std::size_t output_concat_axis_offset = 0;
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+ auto input_shape = input_wrapper->getShape();
+
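+ /* every non-concat axis centers the input within the output; the concat axis
+ * offset is then overwritten with the running offset computed below
+ */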
+ std::vector<std::size_t> offsets(input_shape.size());
+ for (int j = 0; j < offsets.size(); j++)
+ offsets[j] = (output_shape[j] - input_shape[j]) / 2;
+ offsets[concat_axis] = output_concat_axis_offset;
+
+ kernels::concat_with_offsets(stream, output, input, offsets);
+
+ output_concat_axis_offset += input.get_axis_size(concat_axis);
+ }
+ }
+ else
+ {
+ std::size_t output_axis_offset = 0;
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ kernels::concat(stream, output, output_axis_offset, input, concat_axis);
+
+ output_axis_offset += input.get_axis_size(concat_axis);
+ }
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ std::size_t concat_axis;
+ bool zero_padding;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ template <class T>
+ class ConstOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ ConstOp(csl::Stream stream_, const cv::Mat& data)
+ : stream(std::move(stream_))
+ {
+ constTensor = csl::makeTensorHeader<T>(data);
+ csl::copyMatToTensor<T>(data, constTensor, stream);
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(outputs.size() == 1 && inputs.size() == 0);
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+ csl::tensor_ops::copy<T>(stream, output, constTensor);
+ }
+
+ private:
+ csl::Stream stream;
+ csl::Tensor<T> constTensor;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/cudnn.hpp"
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+#include "../kernels/scale_shift.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+#include <utility>
+#include <algorithm>
+#include <functional>
+#include <numeric>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ struct ConvolutionConfiguration {
+ /* the size of the following vectors must be equal to the kernel size */
+ std::vector<std::size_t> kernel_size;
+ std::vector<std::size_t> dilations, strides;
+
+ enum class PaddingMode {
+ MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */
+ VALID, /* no padding is added */
+ SAME /* TensorFlow logic is used for same padding */
+ };
+
+ /* explicit paddings are used if and only if padMode is set to manual */
+ PaddingMode padMode;
+ std::vector<std::size_t> pads_begin, pads_end;
+
+ /* full shape inclusive of channel and batch axis */
+ std::vector<std::size_t> input_shape;
+ std::vector<std::size_t> output_shape;
+
+ /* group count for grouped convolution */
+ std::size_t groups;
+ };
+
+ template <class T>
+ class ConvolutionOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ ConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle, const ConvolutionConfiguration& config, const Mat& filters, const Mat& bias)
+ : stream(std::move(stream_)), cudnnHandle(std::move(handle))
+ {
+ const auto& kernel_size = config.kernel_size;
+ const auto& dilations = config.dilations;
+ const auto& strides = config.strides;
+
+ const auto convolution_order = kernel_size.size();
+ CV_Assert(convolution_order >= 1);
+
+ CV_Assert(convolution_order == dilations.size());
+ CV_Assert(convolution_order == strides.size());
+
+ const auto& input_shape = config.input_shape;
+ const auto& output_shape = config.output_shape;
+ CV_Assert(input_shape.size() == output_shape.size());
+ CV_Assert(input_shape.size() == convolution_order + 2);
+
+ const auto groups = config.groups;
+
+ if (convolution_order > 3)
+ CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D convolution is supported.");
+
+ const auto rank = input_shape.size();
+ const auto output_feature_maps = output_shape[1];
+ const auto input_feature_maps = input_shape[1];
+ const auto input_feature_maps_per_group = input_feature_maps / groups;
+ CV_Assert(input_feature_maps % groups == 0);
+
+ filtersTensor = csl::makeTensorHeader<T>(filters);
+ csl::copyMatToTensor<T>(filters, filtersTensor, stream);
+
+ if (!bias.empty())
+ {
+ biasTensor = csl::makeTensorHeader<T>(bias);
+ csl::copyMatToTensor<T>(bias, biasTensor, stream);
+ }
+
+ /* left and right are misleading as the padding is applicable for any number of dimensions
+ * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
+ *
+ * `common_padding` contains the amount of padding that has to be added to both sides
+ * `padding_left` and `padding_right` contains the amount of padding that needs to be added
+ * to a particular side in addition to the common padding
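+ *
+ * for example, pads_begin = 2 and pads_end = 3 along an axis decompose into
+ * common_padding = min(2, 3) = 2, padding_left = 2 - 2 = 0 and padding_right = 3 - 2 = 1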
+ */
+ std::vector<std::size_t> common_padding(rank, 0);
+ std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0);
+ if (config.padMode == ConvolutionConfiguration::PaddingMode::MANUAL)
+ {
+ const auto& pads_begin = config.pads_begin;
+ const auto& pads_end = config.pads_end;
+
+ CV_Assert(convolution_order == pads_begin.size());
+ CV_Assert(convolution_order == pads_end.size());
+
+ for (int i = 2; i < common_padding.size(); i++)
+ {
+ common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]);
+ padding_left[i] = pads_begin[i - 2] - common_padding[i];
+ padding_right[i] = pads_end[i - 2] - common_padding[i];
+ }
+ }
+ else if (config.padMode == ConvolutionConfiguration::PaddingMode::VALID)
+ {
+ /* nothing to do as the paddings are already preset to zero */
+ }
+ else if (config.padMode == ConvolutionConfiguration::PaddingMode::SAME)
+ {
+ /* TensorFlow Logic:
+ * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
+ *
+ * if total padding is odd, the extra is added towards the end
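+ *
+ * e.g. i = 10, k = 3, s = 2, d = 1 gives o = ceil(10 / 2) = 5 and
+ * total_padding = (5 - 1) * 2 + 3 - 10 = 1; the front gets 0 and the end gets 1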
+ */
+ for (int i = 2; i < rank; i++)
+ {
+ const auto j = i - 2; /* filter index */
+ const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
+ const auto required_total_padding =
+ std::max<std::int64_t>(0, (output_shape[i] - 1) * strides[j] + effective_kernel_size - input_shape[i]);
+
+ common_padding[i] = required_total_padding / 2;
+ padding_left[i] = 0;
+ padding_right[i] = required_total_padding % 2;
+ }
+ }
+
+ /* in some scenarios, the extra padding at the end may not change the output at all */
+ for (int i = 2; i < rank; i++) {
+ const auto j = i - 2; /* filter idx */
+ const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
+ const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
+ std::int64_t rem = (input_shape[i] + total_padding - effective_kernel_size) % strides[j];
+
+ /* the output shape doesn't change if we decrease the total padding by at most `rem`
+ * provided that we decrease from the right
+ */
+ if (rem && padding_right[i] > 0)
+ padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem);
+ }
+
+ auto is_not_zero = [](std::size_t i) { return i != 0; };
+ if(std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) ||
+ std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero))
+ {
+ /* csl::Convolution supports symmetric padding only; hence, we deal with asymmetric padding by
+ * copying the input to a bigger tensor and padding the ends manually
+ */
+ transformed_shape = input_shape;
+ for (int i = 0; i < rank; i++)
+ transformed_shape[i] += padding_left[i] + padding_right[i];
+
+ inputTransformer = csl::TensorTransform<T>(cudnnHandle, padding_left, padding_right);
+ }
+
+ typename csl::Convolution<T>::params_type params;
+ if (transformed_shape.empty())
+ {
+ params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
+ }
+ else
+ {
+ /* the convolution operation will be seeing the transformed input */
+ params.input_shape.assign(std::begin(transformed_shape), std::end(transformed_shape));
+ }
+
+ auto& fshape = params.filter_shape;
+ fshape.resize(rank);
+ fshape[0] = output_feature_maps;
+ fshape[1] = input_feature_maps_per_group;
+ std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2);
+ CV_Assert(fshape.size() == kernel_size.size() + 2);
+
+ params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
+ params.stride = strides;
+ params.dilation = dilations;
+ params.groups = config.groups;
+
+ convoluter = csl::Convolution<T>(cudnnHandle, params);
+
+ csl::WorkspaceBuilder builder;
+ if (!transformed_shape.empty()) {
+ auto& shape = transformed_shape;
+ auto sz = std::accumulate(std::begin(shape), std::end(shape), std::size_t(1), std::multiplies<std::size_t>());
+ builder.require<T>(sz);
+ }
+ builder.require(convoluter.get_workspace_size());
+ scratch_mem_in_bytes = builder.required_workspace_size();
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+ csl::WorkspaceAllocator allocator(workspace);
+
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ if (!transformed_shape.empty())
+ {
+ auto& shape = transformed_shape;
+ auto transformed_input = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
+ inputTransformer.transform(input, transformed_input);
+ input = transformed_input;
+ }
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ convoluter.convolve(output, input, filtersTensor, allocator.get_instance());
+ if (!biasTensor.empty())
+ {
+ std::size_t inner_size = output.size_range(2, output.rank());
+ kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
+ }
+ }
+
+ std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
+
+ private:
+ csl::Stream stream;
+ csl::cudnn::Handle cudnnHandle;
+ csl::Tensor<T> filtersTensor, biasTensor;
+ csl::Convolution<T> convoluter;
+
+ std::vector<std::size_t> transformed_shape;
+ csl::TensorTransform<T> inputTransformer;
+
+ std::size_t scratch_mem_in_bytes;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ELTWISE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ELTWISE_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include "../kernels/eltwise_ops.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ enum class EltwiseOpType {
+ MAX,
+ SUM,
+ PRODUCT
+ };
+
+ template <class T>
+ class EltwiseOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ template <class V>
+ EltwiseOp(csl::Stream stream_, EltwiseOpType op_, std::vector<V> coeffs_)
+ : stream(std::move(stream_)), op{ op_ }, coeffs(std::begin(coeffs_), std::end(coeffs_))
+ {
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(inputs.size() >= 2);
+ CV_Assert(outputs.size() == 1);
+
+ CV_Assert(coeffs.size() == 0 || op == EltwiseOpType::SUM);
+ CV_Assert(coeffs.size() == 0 || inputs.size() == coeffs.size());
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ if (inputs.size() == 2)
+ {
+ auto input_wrapper_x = inputs[0].dynamicCast<wrapper_type>();
+ auto input_x = input_wrapper_x->getView();
+
+ auto input_wrapper_y = inputs[1].dynamicCast<wrapper_type>();
+ auto input_y = input_wrapper_y->getView();
+
+ switch (op)
+ {
+ case EltwiseOpType::MAX: kernels::eltwise_max_2<T>(stream, output, input_x, input_y); break;
+ case EltwiseOpType::PRODUCT: kernels::eltwise_prod_2<T>(stream, output, input_x, input_y); break;
+ case EltwiseOpType::SUM:
+ if (coeffs.empty() || (coeffs[0] == 1 && coeffs[1] == 1))
+ kernels::eltwise_sum_2<T>(stream, output, input_x, input_y);
+ else
+ kernels::eltwise_sum_coeff_2<T>(stream, output, coeffs[0], input_x, coeffs[1], input_y);
+ break;
+ }
+ }
+ else
+ {
+ auto input_wrapper_0 = inputs[0].dynamicCast<wrapper_type>();
+ auto input_0 = input_wrapper_0->getView();
+
+ /* we first make a copy and then apply EltwiseOp cumulatively */
+ csl::tensor_ops::copy(stream, output, input_0);
+
+ for (int i = 1; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ switch (op)
+ {
+ case EltwiseOpType::MAX: kernels::eltwise_max_2<T>(stream, output, output, input); break;
+ case EltwiseOpType::PRODUCT: kernels::eltwise_prod_2<T>(stream, output, output, input); break;
+ case EltwiseOpType::SUM:
+ if (coeffs.empty() || coeffs[i] == 1)
+ kernels::eltwise_sum_2<T>(stream, output, output, input);
+ else
+ {
+ /* if this is the first op, we must scale output too */
+ auto coeff_x = (i == 1) ? coeffs[0] : static_cast<T>(1.0);
+ kernels::eltwise_sum_coeff_2<T>(stream, output, coeff_x, output, coeffs[i], input);
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ EltwiseOpType op;
+ std::vector<T> coeffs;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ELTWISE_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/cublas.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include "../kernels/scale_shift.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ template <class T>
+ class InnerProductOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ InnerProductOp(csl::Stream stream_, csl::cublas::Handle handle, std::size_t axis, const Mat& weights, const Mat& bias)
+ : stream(std::move(stream_)), cublasHandle(std::move(handle)), axis{ axis }
+ {
+ weightsTensor = csl::makeTensorHeader<T>(weights);
+ CV_Assert(get_effective_rank(weightsTensor) == 2);
+ csl::copyMatToTensor<T>(weights, weightsTensor, stream);
+
+ if (!bias.empty())
+ {
+ biasTensor = csl::makeTensorHeader<T>(bias);
+ csl::copyMatToTensor<T>(bias, biasTensor, stream);
+ CV_Assert(weightsTensor.get_axis_size(-2) == biasTensor.size());
+ }
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ std::size_t batch_size = input.size_range(0, axis);
+
+ auto input_size = input.size() / batch_size;
+ CV_Assert(input_size == weightsTensor.get_axis_size(-1));
+
+ auto output_size = output.size() / batch_size;
+ CV_Assert(output_size == weightsTensor.get_axis_size(-2));
+
+ /* we treat the input and output as a matrix with dimensions (batch_size, input_size)
+ * and (batch_size, output_size) respectively
+ *
+ * weight matrix dimensions: (output_size, input_size)
+ *
+ * I(W^T) = O
+ * (batch_size, input_size) * (input_size, output_size) = (batch_size, output_size)
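+ *
+ * e.g. flattening a (N, C, H, W) input at axis = 1 gives batch_size = N and
+ * input_size = C * H * W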
+ */
+ input.reshape(batch_size, input_size);
+ output.reshape(batch_size, output_size);
+ csl::tensor_ops::gemm<T>(cublasHandle, 0.0, output, 1.0, false, input, true, weightsTensor);
+
+ if (!biasTensor.empty())
+ kernels::biasN<T>(stream, output, output, 1, biasTensor);
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ csl::cublas::Handle cublasHandle;
+ csl::Tensor<T> weightsTensor, biasTensor;
+ std::size_t axis;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LRN_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LRN_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/cudnn.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include <cstddef>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ enum class LRNType {
+ ACROSS_CHANNELS,
+ WITHIN_CHANNEL
+ };
+
+ template <class T>
+ class LRNOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ LRNOp(csl::cudnn::Handle handle, LRNType type_, std::size_t local_size, T alpha, T beta, T bias, std::size_t largestInputSize)
+ : scratch_mem_in_bytes { 0 }
+ {
+ typename csl::LRN<T>::LRNType type{};
+ switch (type_) {
+ case LRNType::ACROSS_CHANNELS: type = csl::LRN<T>::LRNType::ACROSS_CHANNELS; break;
+ case LRNType::WITHIN_CHANNEL: type = csl::LRN<T>::LRNType::WITHIN_CHANNEL; break;
+ }
+ lrn = csl::LRN<T>(std::move(handle), local_size, alpha, beta, bias, type);
+
+ csl::WorkspaceBuilder builder;
+ if (type_ == LRNType::WITHIN_CHANNEL) {
+ /* this is not a bug; we require two of these */
+ builder.require<T>(largestInputSize);
+ builder.require<T>(largestInputSize);
+ }
+
+ scratch_mem_in_bytes = builder.required_workspace_size();
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ csl::WorkspaceAllocator allocator(workspace);
+ lrn.normalize(input, output, allocator.get_instance());
+ }
+ }
+
+ std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
+
+ private:
+ csl::LRN<T> lrn;
+ std::size_t scratch_mem_in_bytes;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LRN_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MAX_UNPOOLING_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MAX_UNPOOLING_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+
+#include "../kernels/max_unpooling.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+#include <utility>
+#include <algorithm>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ struct MaxPoolingConfiguration {
+ /* the size of the following vectors must be equal to the pooling order */
+ std::vector<std::size_t> window_size;
+ std::vector<std::size_t> strides;
+
+ enum class PaddingMode {
+ MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */
+ VALID, /* no padding is added */
+ SAME /* TensorFlow logic is used for same padding */
+ };
+
+ PaddingMode padMode;
+
+ /* explicit paddings are used if and only if padMode is set to manual */
+ std::vector<std::size_t> pads_begin;
+
+ /* full shape inclusive of channel and batch axis */
+ std::vector<std::size_t> input_shape;
+ };
+
+ template <class T>
+ class MaxPoolingOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ MaxPoolingOp(csl::Stream stream_, const MaxPoolingConfiguration& config)
+ : stream(std::move(stream_))
+ {
+ window_size = config.window_size;
+
+ const auto pooling_order = window_size.size();
+ CV_Assert(pooling_order >= 1);
+
+ strides = config.strides;
+ CV_Assert(pooling_order == strides.size());
+
+ if (pooling_order != 2 && pooling_order != 3)
+ CV_Error(Error::StsNotImplemented, "Only 2D/3D max-pooling are supported.");
+
+ padding_left.resize(pooling_order);
+ if (config.padMode == MaxPoolingConfiguration::PaddingMode::MANUAL)
+ {
+ const auto& pads_begin = config.pads_begin;
+ CV_Assert(pooling_order == pads_begin.size());
+
+ padding_left.assign(std::begin(pads_begin), std::end(pads_begin));
+ }
+ else if (config.padMode == MaxPoolingConfiguration::PaddingMode::VALID)
+ {
+ /* nothing to do as the paddings are already preset to zero */
+ }
+ else if (config.padMode == MaxPoolingConfiguration::PaddingMode::SAME)
+ {
+ /* TensorFlow Logic:
+ * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
+ *
+ * if total padding is odd, the extra is added towards the end
+ */
+ const auto& input_shape = config.input_shape;
+ CV_Assert(input_shape.size() == pooling_order + 2);
+
+ for (int i = 0; i < pooling_order; i++)
+ {
+ const auto output_dim = (input_shape[i + 2] - 1 + strides[i]) / strides[i];
+ const auto required_total_padding =
+ std::max<std::int64_t>(0, (output_dim - 1) * strides[i] + window_size[i] - input_shape[i + 2]);
+
+ padding_left[i] = required_total_padding / 2;
+ }
+ }
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(inputs.size() == 1 && outputs.size() == 2);
+
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input_data = input_wrapper->getView();
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output_data = output_wrapper->getSpan();
+
+ auto indices_wrapper = outputs[1].dynamicCast<wrapper_type>();
+ auto output_indices = indices_wrapper->getSpan();
+
+ kernels::max_pooling_with_indices<T>(
+ stream, output_data, output_indices, input_data, window_size, strides, padding_left
+ );
+ }
+
+ private:
+ csl::Stream stream;
+
+ std::vector<std::size_t> window_size, strides, padding_left;
+ };
+
+ struct MaxUnpoolingConfiguration {
+ /* the size of the following vectors must be equal to the unpooling order */
+ std::vector<std::size_t> window_size;
+ std::vector<std::size_t> strides;
+ std::vector<std::size_t> pads_begin;
+ };
+
+ template <class T>
+ class MaxUnpoolingOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ MaxUnpoolingOp(csl::Stream stream_, const MaxUnpoolingConfiguration& config)
+ : stream(std::move(stream_))
+ {
+ window_size = config.window_size;
+
+ const auto pooling_order = window_size.size();
+ CV_Assert(pooling_order >= 1);
+
+ strides = config.strides;
+ padding_left = config.pads_begin;
+ CV_Assert(strides.size() == pooling_order);
+ CV_Assert(padding_left.size() == pooling_order);
+
+ if (pooling_order != 2 && pooling_order != 3)
+ CV_Error(Error::StsNotImplemented, "Only 2D/3D max-unpooling are supported.");
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ /* sometimes a third input is passed to provide the output shape; we won't need it */
+ CV_Assert(inputs.size() == 2 || inputs.size() == 3);
+ CV_Assert(outputs.size() >= 1);
+
+ for(int i = 0; i < outputs.size(); i++)
+ {
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input_data = input_wrapper->getView();
+
+ auto indices_wrapper = inputs[1].dynamicCast<wrapper_type>();
+ auto input_indices = indices_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output_data = output_wrapper->getSpan();
+
+ kernels::max_unpooling<T>(stream, output_data, input_data, input_indices, window_size, strides, padding_left);
+ }
+ }
+
+ private:
+ csl::Stream stream;
+
+ std::vector<std::size_t> window_size, strides, padding_left;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MAX_UNPOOLING_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/workspace.hpp"
+
+#include "../kernels/scale_shift.hpp"
+#include "../kernels/normalize.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ template <class T>
+ struct NormalizeConfiguration {
+ std::vector<std::size_t> input_shape;
+
+ /* axis range across which values are normalized
+ *
+ * [0, axis_start) = outer range
+ * [axis_start, axis_end) = mid range
+ * [axis_end + 1, -1) = inner range
+ *
+ * for each location in the outer and inner range, all the values in the mid range are
+ * normalized together
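+ *
+ * e.g. for an (N, C, H, W) input with axis_start = 1 and axis_end = 2, every
+ * channel vector (over C) is normalized independently for each (n, h, w) location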
+ */
+ std::size_t axis_start, axis_end;
+
+ /* 1 for L1 norm, 2 for L2 norm */
+ std::size_t norm;
+
+ /* epsilon to use to avoid division by zero */
+ T eps;
+ };
+
+ template <class T>
+ class NormalizeOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ template <class V>
+ NormalizeOp(csl::Stream stream_, const Mat& weights, const NormalizeConfiguration<V>& config)
+ : stream(std::move(stream_)), weight{ 1.0 }
+ {
+ norm_order = config.norm;
+ epsilon = config.eps;
+ axis_start = config.axis_start;
+ axis_end = config.axis_end;
+
+ if (!weights.empty())
+ {
+ if (weights.total() == 1)
+ {
+ CV_Assert(weights.type() == CV_32F);
+ weight = weights.at<float>(0, 0);
+ }
+ else
+ {
+ weightsTensor = csl::makeTensorHeader<T>(weights);
+ csl::copyMatToTensor<T>(weights, weightsTensor, stream);
+ }
+ }
+
+ std::size_t outer_size = 1;
+ for (int i = 0; i < axis_start; i++)
+ outer_size *= config.input_shape[i];
+
+ std::size_t inner_size = 1;
+ for (int i = axis_end; i < config.input_shape.size(); i++)
+ inner_size *= config.input_shape[i];
+
+ csl::WorkspaceBuilder builder;
+ builder.require<T>(outer_size * inner_size);
+ scratch_mem_in_bytes = builder.required_workspace_size();
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ std::size_t outer_size = input.size_range(0, axis_start);
+ std::size_t mid_size = input.size_range(axis_start, axis_end);
+ std::size_t inner_size = input.size_range(axis_end, input.rank());
+
+ auto ws_allocator = csl::WorkspaceAllocator(workspace);
+ auto scratch = ws_allocator.get_span<T>();
+ kernels::normalize<T>(stream, output, input, outer_size, mid_size, inner_size, norm_order, epsilon, scratch);
+
+ /* there might be a single weight, in which case `weight` will not be equal to 1.0,
+ * or there might be several weights (one per mid-range entry), or we might not
+ * have to scale at all; note that we scale `output` so that the freshly computed
+ * normalization is preserved
+ */
+ if (weight != 1.0)
+ {
+ kernels::scale1<T>(stream, output, output, weight);
+ }
+ else if (!weightsTensor.empty())
+ {
+ CV_Assert(weightsTensor.size() != 1); /* the constructor should have used `weight` instead */
+ CV_Assert(weightsTensor.size() == mid_size);
+ kernels::scaleN<T>(stream, output, output, inner_size, weightsTensor);
+ }
+ }
+
+ std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
+
+ private:
+ csl::Stream stream;
+ csl::Tensor<T> weightsTensor;
+ T weight; /* if there is only one weight, we use this */
+
+ T epsilon;
+ std::size_t norm_order;
+ std::size_t axis_start, axis_end;
+
+ std::size_t scratch_mem_in_bytes;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PADDING_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PADDING_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include "../kernels/fill.hpp"
+#include "../kernels/concat.hpp"
+#include "../kernels/padding.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <algorithm>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ enum class PaddingType {
+ CONSTANT,
+ REFLECTION101
+ };
+
+ template <class T>
+ class PaddingOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ /* `ranges` is indexed by axis and contains the range in the output where the input is copied to */
+ PaddingOp(csl::Stream stream_, PaddingType type_, T value_, std::vector<cv::Range> ranges)
+ : stream(std::move(stream_)), type{ type_ }, value{ value_ }, dstRanges(std::move(ranges))
+ {
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ auto effective_rank = get_effective_rank(input);
+ CV_Assert(get_effective_rank(input) == get_effective_rank(output));
+
+ /* suppose we require padding for the first spatial axis (H in NCHW or D in NCDHW)
+ *
+ * there could be a case where the batch axis, channel axis, and the first spatial axis are all one
+ * this would result in effective rank being less than the number of axes requiring padding
+ */
+ effective_rank = std::max(effective_rank, dstRanges.size());
+
+ for (int i = effective_rank - dstRanges.size(); i < effective_rank; i++)
+ {
+ const auto delta = effective_rank - dstRanges.size();
+ if (dstRanges[i - delta] == Range::all())
+ CV_Assert(input.get_axis_size(i) == output.get_axis_size(i));
+ else
+ CV_Assert(input.get_axis_size(i) == dstRanges[i - delta].size());
+ }
+
+ if (type == PaddingType::CONSTANT)
+ {
+ kernels::fill<T>(stream, output, value);
+
+ std::vector<std::size_t> offsets(effective_rank, 0);
+ for (int i = 0; i < dstRanges.size(); i++)
+ {
+ const auto delta = effective_rank - dstRanges.size();
+ if (dstRanges[i] != Range::all())
+ offsets[delta + i] = dstRanges[i].start;
+ }
+
+ kernels::concat_with_offsets<T>(stream, output, input, offsets);
+ }
+ else if (type == PaddingType::REFLECTION101)
+ {
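+ /* reflection101 mirrors about the border element without repeating it;
+ * for example, the row [a b c] padded by 2 on the left and 1 on the right
+ * becomes [c b a b c b]
+ */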
+ std::vector<std::pair<std::size_t, std::size_t>> ranges(effective_rank);
+ for (int i = 0; i < effective_rank; i++)
+ {
+ const auto delta = effective_rank - dstRanges.size();
+ if (i < delta || dstRanges[i - delta] == Range::all())
+ ranges[i] = { 0, input.get_axis_size(i) };
+ else
+ ranges[i] = { dstRanges[i - delta].start, dstRanges[i - delta].end };
+ }
+
+ kernels::copy_with_reflection101<T>(stream, output, input, ranges);
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ PaddingType type;
+ T value;
+
+ std::vector<cv::Range> dstRanges;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PADDING_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PERMUTE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PERMUTE_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include "../kernels/permute.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ template <class T>
+ class PermuteOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ PermuteOp(csl::Stream stream_, std::vector<std::size_t> order_)
+ : stream(std::move(stream_)), order(std::move(order_)) { }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ /* `order` is the same for every input; decide once whether any axis actually moves */
+ auto needsPermute = [&] {
+ for (int i = 0; i < order.size(); i++)
+ if (order[i] != i)
+ return true;
+ return false;
+ }();
+
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ if (needsPermute)
+ {
+ kernels::permute(stream, output, input, order);
+ }
+ else
+ {
+ if (input.get() != output.get())
+ csl::tensor_ops::copy(stream, output, input);
+ }
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ std::vector<std::size_t> order;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PERMUTE_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/cudnn.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+#include <utility>
+#include <algorithm>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ struct PoolingConfiguration {
+ enum class PoolingMode {
+ MAX,
+ AVERAGE_INCLUDE_PADDING, /* include padding while calculating average */
+ AVERAGE_EXCLUDE_PADDING /* exclude padding while calculating average */
+ };
+
+ PoolingMode poolMode;
+
+ /* the following vectors must have one entry per pooled (spatial) axis */
+ std::vector<std::size_t> window_size;
+ std::vector<std::size_t> strides;
+
+ enum class PaddingMode {
+ MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */
+ VALID, /* no padding is added */
+ SAME /* TensorFlow logic is used for same padding */
+ };
+
+ PaddingMode padMode;
+
+ /* explicit paddings are used if and only if padMode is set to manual */
+ std::vector<std::size_t> pads_begin, pads_end;
+
+ /* the output shape is calculated using the following formula:
+ * output_dim = func[(input_dim + padding_left + padding_right - kernel_dim)/stride] + 1
+ *
+ * rounding mode decides what is used as `func`
+ */
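+ /* for example, input_dim = 10, padding = 1 + 1, kernel_dim = 3, stride = 2 gives
+ * (10 + 1 + 1 - 3)/2 = 4.5; FLOOR yields 4 + 1 = 5 outputs while CEIL yields 5 + 1 = 6
+ */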
+ enum class RoundingMode {
+ CEIL, /* uses ceil */
+ FLOOR
+ };
+
+ RoundingMode roundMode;
+
+ /* full shape inclusive of channel and batch axis */
+ std::vector<std::size_t> input_shape;
+ };
+
+ template <class T>
+ class PoolingOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ PoolingOp(csl::cudnn::Handle handle, const PoolingConfiguration& config)
+ : cudnnHandle(std::move(handle))
+ {
+ const auto& window_size = config.window_size;
+
+ const auto pooling_order = window_size.size();
+ CV_Assert(pooling_order >= 1);
+
+ const auto& strides = config.strides;
+ CV_Assert(pooling_order == strides.size());
+
+ const auto& input_shape = config.input_shape;
+ CV_Assert(input_shape.size() == pooling_order + 2);
+
+ if (pooling_order > 3)
+ CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D pooling are supported.");
+
+ const auto rank = input_shape.size();
+
+ /* left and right are misleading as the padding is applicable for any number of dimensions
+ * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
+ *
+ * `common_padding` contains the amount of padding that has to be added to both sides
+ * `padding_left` and `padding_right` contains the amount of padding that needs to be added
+ * to a particular side in addition to the common padding
+ */
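+ /* for example, pads_begin = 1 and pads_end = 3 on some axis decompose into
+ * common_padding = 1, padding_left = 0 and padding_right = 2
+ */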
+ std::vector<std::size_t> common_padding(rank, 0);
+ std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0);
+ if (config.padMode == PoolingConfiguration::PaddingMode::MANUAL)
+ {
+ const auto& pads_begin = config.pads_begin;
+ const auto& pads_end = config.pads_end;
+
+ CV_Assert(pooling_order == pads_begin.size());
+ CV_Assert(pooling_order == pads_end.size());
+
+ /* cuDNN rounds down by default; hence, if the rounding mode is FLOOR, we do nothing
+ * otherwise, we add extra padding towards the end so that the pooling arithmetic yields
+ * the correct output size without having to deal with fancy fractional sizes
+ */
+ auto pads_end_modified = pads_end;
+ if (config.roundMode == PoolingConfiguration::RoundingMode::CEIL)
+ {
+ for (int i = 0; i < window_size.size(); i++) {
+ auto rem = (input_shape[i + 2] + pads_begin[i] + pads_end[i] - window_size[i]) % strides[i];
+ if (rem)
+ pads_end_modified[i] += strides[i] - rem;
+ }
+ }
+
+ for (int i = 2; i < common_padding.size(); i++)
+ {
+ common_padding[i] = std::min(pads_begin[i - 2], pads_end_modified[i - 2]);
+ padding_left[i] = pads_begin[i - 2] - common_padding[i];
+ padding_right[i] = pads_end_modified[i - 2] - common_padding[i];
+ }
+ }
+ else if (config.padMode == PoolingConfiguration::PaddingMode::VALID)
+ {
+ /* nothing to do as the paddings are already preset to zero */
+ }
+ else if (config.padMode == PoolingConfiguration::PaddingMode::SAME)
+ {
+ /* TensorFlow Logic:
+ * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
+ *
+ * if total padding is odd, the extra is added towards the end
+ */
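+ /* for example, i = 10, k = 3, s = 2 gives o = ceil(10/2) = 5 and
+ * total_padding = (5 - 1) * 2 + 3 - 10 = 1; hence common = 0, left = 0, right = 1
+ */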
+ for (int i = 2; i < rank; i++)
+ {
+ const auto j = i - 2; /* filter index */
+ const auto output_dim = (input_shape[i] - 1 + strides[j]) / strides[j];
+ const auto required_total_padding =
+ std::max<std::int64_t>(0, (output_dim - 1) * strides[j] + window_size[j] - input_shape[i]);
+
+ common_padding[i] = required_total_padding / 2;
+ padding_left[i] = 0;
+ padding_right[i] = required_total_padding % 2;
+ }
+ }
+
+ /* in some scenarios, the extra padding at the end may not change the output at all */
+ for (int i = 2; i < rank; i++) {
+ const auto j = i - 2; /* filter idx */
+ const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
+ std::int64_t rem = (input_shape[i] + total_padding - window_size[j]) % strides[j];
+
+ /* the output shape doesn't change if we decrease the total padding by at most `rem`
+ * provided that we decrease from the right
+ */
+ if (rem && padding_right[i] > 0)
+ padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem);
+ }
+
+ auto is_not_zero = [](std::size_t i) { return i != 0; };
+ if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) ||
+ std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero))
+ {
+ /* csl::Pooling does not fully support asymmetric padding; hence, we deal with asymmetric padding by
+ * copying the input to a bigger tensor and padding the ends manually
+ *
+ * But we first try to avoid the transformation using cuDNN's flexibility. cuDNN can accept a smaller or
+ * a bigger output shape. This effectively allows us to have arbitrary padding at the right.
+ */
+ if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero))
+ {
+ /* there is padding on the left and we are forced to transform */
+ auto transformed_input_shape = input_shape;
+ for (int i = 0; i < rank; i++)
+ transformed_input_shape[i] += padding_left[i] + padding_right[i];
+
+ transformedInput.resize(std::begin(transformed_input_shape), std::end(transformed_input_shape));
+ inputTransformer = csl::TensorTransform<T>(cudnnHandle, padding_left, padding_right);
+ }
+ }
+
+ typename csl::Pooling<T>::params_type params;
+ if (transformedInput.empty())
+ {
+ /* no transform => use original input shape */
+ params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
+ }
+ else
+ {
+ /* the pooling operation will be seeing the transformed input */
+ auto transformed_input_shape = transformedInput.shape_as_vector();
+ params.input_shape.assign(std::begin(transformed_input_shape), std::end(transformed_input_shape));
+ }
+
+ auto output_shape = input_shape;
+ for (int i = 2; i < rank; i++)
+ {
+ auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
+ output_shape[i] = (input_shape[i] + total_padding - window_size[i - 2]) / strides[i - 2] + 1;
+ }
+
+ params.output_shape.assign(std::begin(output_shape), std::end(output_shape));
+ params.window_size = window_size;
+ params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
+ params.stride = strides;
+
+ if (config.poolMode == PoolingConfiguration::PoolingMode::MAX)
+ {
+ params.type = csl::Pooling<T>::PoolingType::MAX;
+ }
+ else if (config.poolMode == PoolingConfiguration::PoolingMode::AVERAGE_INCLUDE_PADDING)
+ {
+ params.type = csl::Pooling<T>::PoolingType::AVERAGE_INCLUDE_PADDING;
+ }
+ else if (config.poolMode == PoolingConfiguration::PoolingMode::AVERAGE_EXCLUDE_PADDING)
+ {
+ params.type = csl::Pooling<T>::PoolingType::AVERAGE_EXCLUDE_PADDING;
+ }
+
+ pooler = csl::Pooling<T>(cudnnHandle, params);
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ if (!transformedInput.empty())
+ {
+ inputTransformer.transform(input, transformedInput);
+ input = csl::TensorView<T>(transformedInput);
+ }
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ pooler.pool(input, output);
+ }
+
+ private:
+ csl::cudnn::Handle cudnnHandle;
+ csl::Pooling<T> pooler;
+
+ csl::Tensor<T> transformedInput;
+ csl::TensorTransform<T> inputTransformer;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+#include "../csl/tensor.hpp"
+
+#include "../kernels/prior_box.hpp"
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ struct PriorBoxConfiguration {
+ std::size_t feature_map_width, feature_map_height;
+ std::size_t image_width, image_height;
+
+ /* parameters for prior boxes for each feature point */
+ std::vector<float> box_widths, box_heights;
+ std::vector<float> offsets_x, offsets_y;
+ float stepX, stepY;
+
+ std::vector<float> variance;
+
+ /* number of priors per feature point */
+ std::size_t num_priors;
+
+ /* clamps the box coordinates to [0, 1] range */
+ bool clip;
+
+ /* normalizes the box coordinates using the image dimensions */
+ bool normalize;
+ };
+
+ template <class T>
+ class PriorBoxOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ PriorBoxOp(csl::Stream stream_, const PriorBoxConfiguration& config)
+ : stream(std::move(stream_))
+ {
+ feature_map_width = config.feature_map_width;
+ feature_map_height = config.feature_map_height;
+
+ image_width = config.image_width;
+ image_height = config.image_height;
+
+ const auto& box_widths = config.box_widths;
+ const auto& box_heights = config.box_heights;
+ CV_Assert(box_widths.size() == box_heights.size());
+
+ box_size = box_widths.size();
+
+ const auto& offsets_x = config.offsets_x;
+ const auto& offsets_y = config.offsets_y;
+ CV_Assert(offsets_x.size() == offsets_y.size());
+
+ offset_size = offsets_x.size();
+
+ /* for better memory utilization and presumably better cache performance, we merge
+ * the four vectors and put them in a single tensor
+ */
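+ /* for example, with two prior boxes and one offset pair, the merged layout is:
+ * [w0, w1, h0, h1, ox0, oy0]
+ */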
+ auto total = box_widths.size() * 2 + offsets_x.size() * 2;
+ std::vector<float> merged_params;
+ merged_params.insert(std::end(merged_params), std::begin(box_widths), std::end(box_widths));
+ merged_params.insert(std::end(merged_params), std::begin(box_heights), std::end(box_heights));
+ merged_params.insert(std::end(merged_params), std::begin(offsets_x), std::end(offsets_x));
+ merged_params.insert(std::end(merged_params), std::begin(offsets_y), std::end(offsets_y));
+ CV_Assert(merged_params.size() == total);
+
+ paramsTensor.resize(total);
+ csl::memcpy(paramsTensor.get(), merged_params.data(), total, stream); /* copies from pageable memory are synchronous w.r.t. the host; `merged_params` may safely go out of scope */
+
+ const auto& variance_ = config.variance;
+ variance.assign(std::begin(variance_), std::end(variance_));
+
+ num_priors = config.num_priors;
+ stepX = config.stepX;
+ stepY = config.stepY;
+ clip = config.clip;
+ normalize = config.normalize;
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(inputs.size() == 2); /* we do not use the inputs, but the layer is always given two */
+ CV_Assert(outputs.size() == 1);
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ /* we have stored all the parameters in a single tensor; we now create appropriate views
+ * into that tensor for each of the parameter arrays
+ */
+ auto boxWidths = csl::View<float>(paramsTensor.get(), box_size);
+ auto boxHeights = csl::View<float>(paramsTensor.get() + box_size, box_size);
+ auto offsetsX = csl::View<float>(paramsTensor.get() + 2 * box_size, offset_size);
+ auto offsetsY = csl::View<float>(paramsTensor.get() + 2 * box_size + offset_size, offset_size);
+
+ kernels::generate_prior_boxes<T>(stream, output,
+ boxWidths, boxHeights, offsetsX, offsetsY, stepX, stepY,
+ variance, num_priors, feature_map_width, feature_map_height, image_width, image_height, normalize, clip);
+ }
+
+ private:
+ csl::Stream stream;
+ csl::Tensor<float> paramsTensor; /* widths, heights, offsetsX, offsetsY */
+
+ std::size_t feature_map_width, feature_map_height;
+ std::size_t image_width, image_height;
+
+ std::size_t box_size, offset_size;
+ float stepX, stepY;
+
+ std::vector<float> variance;
+
+ std::size_t num_priors;
+ bool clip, normalize;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REGION_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REGION_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/cudnn.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include "../kernels/region.hpp"
+
+#include "../../nms.inl.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ enum class SquashMethod {
+ SOFTMAX,
+ SIGMOID
+ };
+
+ template <class T>
+ struct RegionConfiguration {
+ /* The image is divided into (H, W) cells.
+ *
+ * Each cell is interested in exactly one object and predicts `boxes_per_cell` bounding boxes
+ * for that object.
+ *
+ * Each bounding box contains:
+ * - 4 box coordinates
+ * - objectness confidence score
+ * - `classes` number of class scores
+ *
+ * The object score is reduced to a probability using sigmoid and the class scores are reduced to
+ * probabilities by either applying sigmoid or softmax (which is a configuration option).
+ *
+ * object_prob = sigmoid(object_score)
+ * conditional_class_prob = sigmoid(class_score) or softmax over all class scores
+ *
+ * actual class probability = conditional_class_prob * object_prob
+ */
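+ /* each cell hence stores `boxes_per_cell` contiguous blocks of (4 + 1 + classes) values:
+ * [center_x, center_y, width, height, object_score, class_score_0, ..., class_score_{classes - 1}]
+ */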
+
+ /* method for reducing class scores to probabilities */
+ SquashMethod squash_method;
+
+ std::size_t classes, boxes_per_cell;
+
+ std::size_t width_norm, height_norm;
+
+ /* prob cutoffs below which the prediction is nulled */
+ T object_prob_cutoff;
+ T class_prob_cutoff;
+
+ T nms_iou_threshold;
+ };
+
+ template <class T>
+ class RegionOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ template <class V>
+ RegionOp(csl::Stream stream_, const cv::Mat& bias, const RegionConfiguration<V>& config)
+ : stream(std::move(stream_))
+ {
+ biasTensor = csl::makeTensorHeader<T>(bias);
+ csl::copyMatToTensor<T>(bias, biasTensor, stream);
+
+ classes = config.classes;
+ boxes_per_cell = config.boxes_per_cell;
+
+ width_norm = config.width_norm;
+ height_norm = config.height_norm;
+
+ squash_type = config.squash_method;
+
+ object_prob_cutoff = config.object_prob_cutoff;
+ class_prob_cutoff = config.class_prob_cutoff;
+
+ nms_iou_threshold = config.nms_iou_threshold;
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(outputs.size() == 1);
+
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ csl::memcpy<T>(output.get(), input.get(), output.size(), stream);
+
+ auto rows = input.get_axis_size(1);
+ auto cols = input.get_axis_size(2);
+
+ auto cell_box_size = classes + 4 + 1;
+
+ /* we squash class scores into probabilities using softmax or sigmoid */
+ if (squash_type == SquashMethod::SOFTMAX)
+ kernels::softmax_strided<T>(stream, output, input, classes, cell_box_size, 5);
+ else if (squash_type == SquashMethod::SIGMOID)
+ kernels::sigmoid_strided<T>(stream, output, input, classes, cell_box_size, 5);
+
+ kernels::region_finalize<T>(stream, output, input, biasTensor, object_prob_cutoff, class_prob_cutoff,
+ height_norm, width_norm, rows, cols, boxes_per_cell, cell_box_size, classes);
+
+ if (nms_iou_threshold > 0) {
+ auto output_mat = output_wrapper->getMutableHostMat();
+ CV_Assert(output_mat.type() == CV_32F);
+ for (int i = 0; i < input.get_axis_size(0); i++) {
+ auto sample_size = rows * cols * boxes_per_cell * cell_box_size;
+ do_nms_sort(reinterpret_cast<float*>(output_mat.data) + i * sample_size, rows * cols * boxes_per_cell, class_prob_cutoff, nms_iou_threshold);
+ }
+ }
+ }
+
+ private:
+ void do_nms_sort(float *detections, int total, float score_thresh, float nms_thresh)
+ {
+ std::vector<Rect2d> boxes(total);
+ std::vector<float> scores(total);
+
+ for (int i = 0; i < total; ++i)
+ {
+ Rect2d &b = boxes[i];
+ int box_index = i * (classes + 4 + 1);
+ b.width = detections[box_index + 2];
+ b.height = detections[box_index + 3];
+ b.x = detections[box_index + 0] - b.width / 2;
+ b.y = detections[box_index + 1] - b.height / 2;
+ }
+
+ std::vector<int> indices;
+ for (int k = 0; k < classes; ++k)
+ {
+ for (int i = 0; i < total; ++i)
+ {
+ int box_index = i * (classes + 4 + 1);
+ int class_index = box_index + 5;
+ scores[i] = detections[class_index + k];
+ detections[class_index + k] = 0;
+ }
+ NMSBoxes(boxes, scores, score_thresh, nms_thresh, indices);
+ for (int i = 0, n = indices.size(); i < n; ++i)
+ {
+ int box_index = indices[i] * (classes + 4 + 1);
+ int class_index = box_index + 5;
+ detections[class_index + k] = scores[indices[i]];
+ }
+ }
+ }
+
+ private:
+ csl::Stream stream;
+
+ csl::Tensor<T> biasTensor;
+ std::size_t classes, boxes_per_cell;
+ std::size_t width_norm, height_norm;
+ SquashMethod squash_type;
+
+ T object_prob_cutoff, class_prob_cutoff;
+ T nms_iou_threshold;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REGION_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REORG_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REORG_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../kernels/permute.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ template <class T>
+ class ReorgOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ ReorgOp(csl::Stream stream_, std::size_t stride_)
+ : stream(std::move(stream_)), stride{ stride_ } { }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ const std::size_t permute_input_shape[] = {
+ input.get_axis_size(0),
+ input.get_axis_size(1) * input.get_axis_size(2) / (stride * stride),
+ stride,
+ input.get_axis_size(3),
+ stride
+ };
+
+ constexpr std::size_t order[] = { 0, 2, 4, 1, 3 };
+
+ const std::size_t permute_output_shape[] = {
+ permute_input_shape[order[0]],
+ permute_input_shape[order[1]],
+ permute_input_shape[order[2]],
+ permute_input_shape[order[3]],
+ permute_input_shape[order[4]]
+ };
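+ /* for example, an NCHW input of shape [1, 64, 26, 26] with stride 2 is viewed as
+ * [1, 64 * 26 / 4, 2, 26, 2] = [1, 416, 2, 26, 2] and permuted to [1, 2, 2, 416, 26]
+ */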
+
+ input.unsqueeze();
+ input.reshape(std::begin(permute_input_shape), std::end(permute_input_shape));
+
+ output.unsqueeze();
+ output.reshape(std::begin(permute_output_shape), std::end(permute_output_shape));
+
+ kernels::permute(stream, output, input, { std::begin(order), std::end(order) });
+ }
+
+ private:
+ csl::Stream stream;
+ std::size_t stride;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REORG_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESHAPE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESHAPE_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ template <class T>
+ class ReshapeOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ ReshapeOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ /* sometimes the output shape is passed as extra inputs; hence, >= instead of == */
+ CV_Assert(inputs.size() >= outputs.size());
+
+ for (int i = 0; i < outputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ if (input.get() != output.get())
+ {
+ while (input.rank() < output.rank())
+ input.unsqueeze();
+
+ while (output.rank() < input.rank())
+ output.unsqueeze();
+
+ input.reshape_as(output);
+ csl::tensor_ops::copy(stream, output, input);
+ }
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESHAPE_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESIZE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESIZE_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+
+#include "../kernels/resize.hpp"
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ enum class InterpolationType {
+ NEAREST_NEIGHBOUR,
+ BILINEAR
+ };
+
+ template <class T>
+ class ResizeOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ ResizeOp(csl::Stream stream_, InterpolationType type_, float scaleHeight_, float scaleWidth_)
+ : stream(std::move(stream_)), type{ type_ }, scaleHeight{ scaleHeight_ }, scaleWidth{ scaleWidth_ }
+ {
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ if (type == InterpolationType::NEAREST_NEIGHBOUR)
+ kernels::resize_nn<T>(stream, output, input);
+ else if (type == InterpolationType::BILINEAR)
+ kernels::resize_bilinear<T>(stream, output, input, scaleHeight, scaleWidth);
+ }
+
+ private:
+ csl::Stream stream;
+ InterpolationType type;
+ float scaleHeight, scaleWidth; /* for bilinear interpolation */
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESIZE_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include "../kernels/scale_shift.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ template <class T>
+ class ScaleShiftOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ ScaleShiftOp(csl::Stream stream_, std::size_t axis, const cv::Mat& weights, const cv::Mat& bias)
+ : stream(std::move(stream_)), axis{ axis }
+ {
+ if (!weights.empty())
+ {
+ weightsTensor = csl::makeTensorHeader<T>(weights);
+ csl::copyMatToTensor<T>(weights, weightsTensor, stream);
+ }
+
+ if (!bias.empty())
+ {
+ biasTensor = csl::makeTensorHeader<T>(bias);
+ csl::copyMatToTensor<T>(bias, biasTensor, stream);
+ }
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(outputs.size() == 1);
+
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ csl::TensorView<T> weights;
+ if (weightsTensor.empty() && biasTensor.empty())
+ {
+ CV_Assert(inputs.size() == 2);
+
+ /* no explicit scale/shift values provided; use the second input as weights */
+ auto wrapper = inputs[1].dynamicCast<wrapper_type>();
+ weights = wrapper->getView();
+ }
+ else if (!weightsTensor.empty())
+ {
+ weights = csl::TensorSpan<T>(weightsTensor);
+ }
+
+ csl::TensorView<T> bias;
+ if (!biasTensor.empty())
+ bias = csl::TensorSpan<T>(biasTensor);
+
+ const auto numParams = !weights.empty() ? weights.size() : bias.size();
+ CV_Assert(numParams != 0);
+ if (!weightsTensor.empty() && !biasTensor.empty())
+ {
+ CV_CheckEQ(weights.size(), bias.size(), "weights and bias size are not equal");
+ }
+
+ /* the weights/bias might require broadcasting to scale/shift */
+ const int end_axis = [&] {
+ for (int endAxis = axis + 1; endAxis <= input.rank(); endAxis++)
+ {
+ std::size_t size = input.size_range(axis, endAxis);
+ if (size == numParams)
+ return endAxis;
+ }
+ CV_Assert(0 /* invalid weights matrix */);
+ }();
+
+ std::size_t inner_size = input.size_range(end_axis, input.rank());
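+
+ /* for example, for an input of shape [2, 3, 4, 4] with axis = 1 and three weights:
+ * size_range(1, 2) = 3 matches numParams, so end_axis = 2 and inner_size = 4 * 4 = 16;
+ * each weight then scales a contiguous block of 16 elements
+ */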
+
+ if (!weights.empty() && !bias.empty())
+ kernels::scaleN_with_biasN<T>(stream, output, input, inner_size, weights, bias);
+ else if (!weights.empty())
+ kernels::scaleN<T>(stream, output, input, inner_size, weights);
+ else
+ kernels::biasN<T>(stream, output, input, inner_size, bias);
+ }
+
+ private:
+ csl::Stream stream;
+ csl::Tensor<T> weightsTensor, biasTensor;
+ std::size_t axis;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHUFFLE_CHANNEL_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHUFFLE_CHANNEL_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include "../kernels/permute.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ template <class T>
+ class ShuffleChannelOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ ShuffleChannelOp(csl::Stream stream_, std::size_t group_)
+ : stream(std::move(stream_)), group{ group_ } { }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ if (group == 1) {
+ /* with a single group, the permutation is an identity and reduces to a copy (see the else branch) */
+ if (input.get() != output.get()) {
+ input.reshape_as(output);
+ csl::tensor_ops::copy(stream, output, input);
+ }
+ } else {
+ const std::size_t permute_input_shape[] = {
+ input.get_axis_size(0),
+ group,
+ input.get_axis_size(1) / group,
+ input.get_axis_size(2) * input.get_axis_size(3)
+ };
+
+ constexpr std::size_t order[] = { 0, 2, 1, 3 };
+
+ const std::size_t permute_output_shape[] = {
+ permute_input_shape[order[0]],
+ permute_input_shape[order[1]],
+ permute_input_shape[order[2]],
+ permute_input_shape[order[3]],
+ };
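+
+ /* for example, with group = 2 and four channels, the [N, 2, 2, HW] view transposed
+ * over axes 1 and 2 reorders the channels [0 1 2 3] to [0 2 1 3]
+ */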
+
+ input.reshape(std::begin(permute_input_shape), std::end(permute_input_shape));
+ output.reshape(std::begin(permute_output_shape), std::end(permute_output_shape));
+ kernels::permute(stream, output, input, { std::begin(order), std::end(order) });
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ std::size_t group;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHUFFLE_CHANNEL_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SLICE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SLICE_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+
+#include "../kernels/slice.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ template <class T>
+ class SliceOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ /* offsets is indexed by output number and each subvector is indexed by axis number */
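+ /* for example, splitting a [1, 4, 6, 6] tensor into two [1, 2, 6, 6] outputs along the
+ * channel axis would use offsets = {{0, 0, 0, 0}, {0, 2, 0, 0}}
+ */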
+ SliceOp(csl::Stream stream_, std::vector<std::vector<std::size_t>> offsets)
+ : stream(std::move(stream_)), offsets(std::move(offsets))
+ {
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ /* sometimes the output shape is passed in the form of a second input tensor;
+ * it's only required during initialization and is not used here
+ */
+ CV_Assert(inputs.size() == 1 || inputs.size() == 2);
+
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ for (int i = 0; i < outputs.size(); ++i)
+ {
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ kernels::slice<T>(stream, output, input, offsets[i]);
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ std::vector<std::vector<std::size_t>> offsets;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SLICE_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/cudnn.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include <cstddef>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ template <class T>
+ class SoftmaxOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ SoftmaxOp(csl::cudnn::Handle handle, std::size_t axis_, bool log_)
+ : cudnnHandle(std::move(handle)), channel_axis{ axis_ }, log{ log_ }
+ {
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ for (int i = 0; i < inputs.size(); i++)
+ {
+ auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ csl::tensor_ops::softmax<T>(cudnnHandle, output, input, channel_axis, log);
+ }
+ }
+
+ private:
+ csl::cudnn::Handle cudnnHandle;
+ std::size_t channel_axis;
+ bool log;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SPLIT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SPLIT_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ template <class T>
+ class SplitOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ SplitOp(csl::Stream stream_)
+ : stream(std::move(stream_))
+ {
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(inputs.size() == 1);
+
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ for (int i = 0; i < outputs.size(); i++)
+ {
+ auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ csl::tensor_ops::copy<T>(stream, output, input);
+ }
+ }
+
+ private:
+ csl::Stream stream;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SPLIT_HPP */
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/cudnn.hpp"
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include "../kernels/scale_shift.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+#include <utility>
+#include <algorithm>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+ struct TransposeConvolutionConfiguration {
+ /* other than `input_shape` and `output_shape`, all the configuration values must be provided
+ * for the corresponding convolution operation (not transpose convolution)
+ */
+
+ /* the following vectors must have one entry per kernel (spatial) axis */
+ std::vector<std::size_t> kernel_size;
+ std::vector<std::size_t> dilations, strides;
+
+ enum class PaddingMode {
+ MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */
+ VALID, /* no padding is added */
+ SAME /* TensorFlow logic is used for same padding */
+ };
+
+ /* explicit paddings are used if and only if padMode is set to manual */
+ PaddingMode padMode;
+ std::vector<std::size_t> pads_begin, pads_end;
+
+ /* full shape inclusive of channel and batch axis */
+ std::vector<std::size_t> input_shape;
+ std::vector<std::size_t> output_shape;
+
+ /* group count for grouped convolution */
+ std::size_t groups;
+ };
+
+ template <class T>
+ class TransposeConvolutionOp final : public CUDABackendNode {
+ public:
+ using wrapper_type = GetCUDABackendWrapperType<T>;
+
+ TransposeConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle, const TransposeConvolutionConfiguration& config, const Mat& filters, const Mat& bias)
+ : stream(std::move(stream_)), cudnnHandle(std::move(handle))
+ {
+ /* we make use of the backward pass of convolution to perform the forward pass of transpose convolution;
+ * hence, we must set up the configuration for the corresponding convolution operation and run its backward pass
+ */
+ const auto& kernel_size = config.kernel_size;
+ const auto& dilations = config.dilations;
+ const auto& strides = config.strides;
+
+ const auto convolution_order = kernel_size.size();
+ CV_Assert(convolution_order >= 1);
+
+ CV_Assert(convolution_order == dilations.size());
+ CV_Assert(convolution_order == strides.size());
+
+ const auto& input_shape = config.input_shape;
+ const auto& output_shape = config.output_shape;
+ CV_Assert(input_shape.size() == output_shape.size());
+ CV_Assert(input_shape.size() == convolution_order + 2);
+
+ const auto groups = config.groups;
+
+ if (convolution_order > 3)
+ CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D transpose convolution is supported.");
+
+ const auto rank = input_shape.size();
+ const auto input_feature_maps = input_shape[1];
+ const auto output_feature_maps = output_shape[1];
+ const auto output_feature_maps_per_group = output_feature_maps / groups;
+ CV_Assert(output_feature_maps % groups == 0);
+
+ filtersTensor = csl::makeTensorHeader<T>(filters);
+ csl::copyMatToTensor<T>(filters, filtersTensor, stream);
+
+ if (!bias.empty())
+ {
+ CV_Assert(bias.total() == output_feature_maps);
+ biasTensor = csl::makeTensorHeader<T>(bias);
+ csl::copyMatToTensor<T>(bias, biasTensor, stream);
+ }
+
+ /* left and right are misleading as the padding is applicable for any number of dimensions
+ * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
+ *
+ * `common_padding` contains the amount of padding that has to be added to both sides
+ * `padding_left` and `padding_right` contains the amount of padding that needs to be added
+ * to a particular side in addition to the common padding
+ *
+ * note that we compute the padding for the convolution operation
+ */
+ std::vector<std::size_t> common_padding(rank, 0);
+ std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0);
+ if (config.padMode == TransposeConvolutionConfiguration::PaddingMode::MANUAL)
+ {
+ const auto& pads_begin = config.pads_begin;
+ const auto& pads_end = config.pads_end;
+
+ CV_Assert(convolution_order == pads_begin.size());
+ CV_Assert(convolution_order == pads_end.size());
+
+ for (int i = 2; i < common_padding.size(); i++)
+ {
+ common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]);
+ padding_left[i] = pads_begin[i - 2] - common_padding[i];
+ padding_right[i] = pads_end[i - 2] - common_padding[i];
+ }
+ }
+ else if (config.padMode == TransposeConvolutionConfiguration::PaddingMode::VALID)
+ {
+ /* nothing to do as the paddings are already preset to zero */
+ }
+ else if (config.padMode == TransposeConvolutionConfiguration::PaddingMode::SAME)
+ {
+ /* TensorFlow Logic:
+ * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
+ *
+ * if total padding is odd, the extra is added towards the end
+ */
+ for (int i = 2; i < rank; i++)
+ {
+ const auto j = i - 2; /* filter index */
+ const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
+ const auto required_total_padding =
+ std::max<std::int64_t>(0, (input_shape[i] - 1) * strides[j] + effective_kernel_size - output_shape[i]);
+
+ common_padding[i] = required_total_padding / 2;
+ padding_left[i] = 0;
+ padding_right[i] = required_total_padding % 2;
+ }
+ }
+
+ /* in some scenarios, the extra padding at the end may not change the output at all */
+ for (int i = 2; i < rank; i++) {
+ const auto j = i - 2; /* filter idx */
+ const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
+ const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
+ std::int64_t rem = (input_shape[i] + total_padding - effective_kernel_size) % strides[j];
+
+ /* the output shape doesn't change if we decrease the total padding by at most `rem`
+ * provided that we decrease from the right
+ */
+ if (rem && padding_right[i] > 0)
+ padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem);
+ }
+
+ auto is_not_zero = [](std::size_t i) { return i != 0; };
+ if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) ||
+ std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero))
+ {
+ CV_Error(Error::StsNotImplemented, "Padding configuration requires asymmetric padding and hence is not supported.");
+ }
+
+ typename csl::TransposeConvolution<T>::params_type params;
+ params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
+ params.output_shape.assign(std::begin(output_shape), std::end(output_shape));
+
+ auto& fshape = params.filter_shape;
+ fshape.resize(rank);
+ fshape[0] = input_feature_maps;
+ fshape[1] = output_feature_maps_per_group;
+ std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2);
+ CV_Assert(fshape.size() == kernel_size.size() + 2);
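+
+ /* for example, a 2D transposed convolution taking 16 feature maps to 32 with 3x3 kernels
+ * and groups = 2 yields a convolution filter shape of [16, 16, 3, 3]
+ */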
+
+ params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
+ params.stride = strides;
+ params.dilation = dilations;
+ params.groups = config.groups;
+
+ convoluter = csl::TransposeConvolution<T>(cudnnHandle, params);
+
+ csl::WorkspaceBuilder builder;
+ builder.require(convoluter.get_workspace_size());
+ scratch_mem_in_bytes = builder.required_workspace_size();
+ }
+
+ void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ csl::Workspace& workspace) override
+ {
+ CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+ auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+ auto input = input_wrapper->getView();
+
+ auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+ auto output = output_wrapper->getSpan();
+
+ csl::WorkspaceAllocator allocator(workspace);
+ convoluter.transpose_convolve(output, input, filtersTensor, allocator.get_instance());
+ if (!biasTensor.empty())
+ {
+ std::size_t inner_size = total(output_wrapper->getShape(), 2, -1);
+ kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
+ }
+ }
+
+ std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
+
+ private:
+ csl::Stream stream;
+ csl::cudnn::Handle cudnnHandle;
+ csl::Tensor<T> filtersTensor, biasTensor;
+ csl::TransposeConvolution<T> convoluter;
+
+ std::size_t scratch_mem_in_bytes;
+ };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP */
+++ /dev/null
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-// this file is a stub and will be removed once actual code is added
-
-#include "../precomp.hpp"
-
-#ifndef HAVE_CUDA
-# error "CUDA4DNN should be enabled iff CUDA and cuDNN were found"
-#endif
-
-#include <cudnn.h>
-
-void cuda4dnn_build_test_func() {
- auto ver = cudnnGetVersion();
- CV_UNUSED(ver);
-}
#include "op_halide.hpp"
#include "op_inf_engine.hpp"
#include "op_vkcom.hpp"
+#include "op_cuda.hpp"
#include "halide_scheduler.hpp"
+
#include <set>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <iterator>
#include <numeric>
+#include <memory>
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/core/utils/configuration.private.hpp>
#include <opencv2/core/utils/logger.hpp>
+#include <opencv2/core/cuda.hpp>
+
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
if (haveVulkan())
backends.push_back(std::make_pair(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN));
#endif
+
+#ifdef HAVE_CUDA
+ if (haveCUDA()) {
+ backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA));
+ backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16));
+ }
+#endif
}
static inline bool checkIETarget(int target)
{
return Ptr<BackendWrapper>(new VkComBackendWrapper(m));
#endif // HAVE_VULKAN
}
+ else if (backendId == DNN_BACKEND_CUDA)
+ {
+ CV_Assert(haveCUDA());
+
+#ifdef HAVE_CUDA
+ switch (targetId)
+ {
+ case DNN_TARGET_CUDA:
+ return CUDABackendWrapperFP32::create(m);
+ case DNN_TARGET_CUDA_FP16:
+ return CUDABackendWrapperFP16::create(m);
+ default:
+ CV_Assert(IS_DNN_CUDA_TARGET(targetId));
+ }
+#endif
+ }
else
CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
return Ptr<BackendWrapper>();
preferableBackend = DNN_BACKEND_DEFAULT;
preferableTarget = DNN_TARGET_CPU;
skipInfEngineInit = false;
+
+#ifdef HAVE_CUDA
+ if (cv::cuda::getCudaEnabledDeviceCount() > 0)
+ {
+ cuda4dnn::csl::CSLContext context;
+ context.stream = cuda4dnn::csl::Stream(true);
+ context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
+ context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
+
+ cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context)));
+ }
+#endif
}
Ptr<DataLayer> netInputLayer;
std::vector<int64> layersTimings;
Mat output_blob;
+#ifdef HAVE_CUDA
+ struct CudaInfo_t
+ {
+ CudaInfo_t(cuda4dnn::csl::CSLContext ctxt) : context(std::move(ctxt)) { }
+ cuda4dnn::csl::CSLContext context;
+ cuda4dnn::csl::Workspace workspace;
+ };
+
+ std::unique_ptr<CudaInfo_t> cudaInfo;
+#endif
+
Ptr<BackendWrapper> wrap(Mat& host)
{
if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU)
return Ptr<BackendWrapper>(new VkComBackendWrapper(baseBuffer, host));
#endif
}
+ else if (preferableBackend == DNN_BACKEND_CUDA)
+ {
+ CV_Assert(haveCUDA());
+#ifdef HAVE_CUDA
+ switch (preferableTarget)
+ {
+ case DNN_TARGET_CUDA:
+ return CUDABackendWrapperFP32::create(baseBuffer, shape);
+ case DNN_TARGET_CUDA_FP16:
+ return CUDABackendWrapperFP16::create(baseBuffer, shape);
+ default:
+ CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget));
+ }
+#endif
+ }
else
CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
}
preferableTarget == DNN_TARGET_FPGA);
CV_Assert(preferableBackend != DNN_BACKEND_VKCOM ||
preferableTarget == DNN_TARGET_VULKAN);
+ CV_Assert(preferableBackend != DNN_BACKEND_CUDA ||
+ IS_DNN_CUDA_TARGET(preferableTarget));
+
if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
{
if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
preferableTarget = DNN_TARGET_CPU;
}
+ if (preferableBackend == DNN_BACKEND_CUDA && !haveCUDA())
+ {
+#ifdef HAVE_CUDA
+ CV_LOG_WARNING(NULL, "unable to use CUDA backend; switching to CPU");
+#else
+ CV_LOG_WARNING(NULL, "DNN module was not built with CUDA backend; switching to CPU");
+#endif
+ preferableBackend = DNN_BACKEND_OPENCV;
+ preferableTarget = DNN_TARGET_CPU;
+ }
+
clear();
allocateLayers(blobsToKeep_);
initBackend();
- if (!netWasAllocated )
+ if (!netWasAllocated)
{
#ifdef HAVE_HALIDE
if (preferableBackend == DNN_BACKEND_HALIDE)
initInfEngineBackend();
else if (preferableBackend == DNN_BACKEND_VKCOM)
initVkComBackend();
+ else if (preferableBackend == DNN_BACKEND_CUDA)
+ initCUDABackend();
else
CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
}
#endif // HAVE_INF_ENGINE
}
+ void initCUDABackend()
+ {
+ CV_Assert(haveCUDA());
+
+#ifdef HAVE_CUDA
+ for (auto& layer : layers)
+ {
+ auto& ld = layer.second;
+ auto& layerInstance = ld.layerInstance;
+
+ if (!layerInstance->supportBackend(DNN_BACKEND_CUDA))
+ {
+ std::ostringstream os;
+ os << "CUDA backend will fallback to the CPU implementation for the layer \"" << ld.name
+ << "\" of type " << ld.type << '\n';
+ CV_LOG_INFO(NULL, os.str().c_str());
+ continue;
+ }
+
+ /* layer implementations may move handles out of the context (e.g. std::move(context->stream));
+ * we pass a copy so that `cudaInfo->context` stays intact */
+ auto context = cudaInfo->context;
+ auto node = layerInstance->initCUDA(&context, ld.inputBlobsWrappers, ld.outputBlobsWrappers);
+ ld.backendNodes[DNN_BACKEND_CUDA] = node;
+
+ auto cudaNode = node.dynamicCast<CUDABackendNode>();
+ cudaInfo->workspace.require(cudaNode->get_workspace_memory_in_bytes());
+ }
+#endif
+ }
+
void allocateLayer(int lid, const LayersShapesMap& layersShapes)
{
CV_TRACE_FUNCTION();
for (size_t i = 0; i < ninputs; i++)
{
ld.inputBlobsWrappers[i] = wrap(netInputLayer->inputsData[i]);
+#ifdef HAVE_CUDA
+ if (IS_DNN_CUDA_TARGET(preferableTarget))
+ {
+ auto wrapper = ld.inputBlobsWrappers[i].dynamicCast<CUDABackendWrapper>();
+ wrapper->setStream(cudaInfo->context.stream);
+ }
+#endif
}
}
else
for (int i = 0; i < ld.outputBlobs.size(); ++i)
{
ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]);
+#ifdef HAVE_CUDA
+ if (IS_DNN_CUDA_TARGET(preferableTarget))
+ {
+ auto wrapper = ld.outputBlobsWrappers[i].dynamicCast<CUDABackendWrapper>();
+ wrapper->setStream(cudaInfo->context.stream);
+ }
+#endif
}
- ld.internalBlobsWrappers.resize(ld.internals.size());
- for (int i = 0; i < ld.internals.size(); ++i)
+
+ /* CUDA backend has its own system for internal blobs; we don't need these */
+ ld.internalBlobsWrappers.resize((preferableBackend == DNN_BACKEND_CUDA) ? 0 : ld.internals.size());
+ for (int i = 0; i < ld.internalBlobsWrappers.size(); ++i)
{
ld.internalBlobsWrappers[i] = wrap(ld.internals[i]);
}
void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
{
if( !fusion || (preferableBackend != DNN_BACKEND_OPENCV &&
+ preferableBackend != DNN_BACKEND_CUDA &&
preferableBackend != DNN_BACKEND_INFERENCE_ENGINE))
return;
blobManager.reset();
backendWrappers.clear();
+
+ for(auto& layer : layers)
+ {
+ auto& ld = layer.second;
+ ld.inputBlobsWrappers.clear();
+ ld.outputBlobsWrappers.clear();
+ ld.internalBlobsWrappers.clear();
+ }
+
// Fake references to input blobs.
for (int i = 0; i < layers[0].outputBlobs.size(); ++i)
blobManager.addReference(LayerPin(0, i));
{
Ptr<BackendNode> node = it->second;
CV_Assert(!node.empty());
- if (preferableBackend == DNN_BACKEND_HALIDE)
+ if (preferableBackend == DNN_BACKEND_CUDA)
+ {
+ CV_Assert(haveCUDA());
+
+#ifdef HAVE_CUDA
+ Ptr<CUDABackendNode> cudaNode = node.dynamicCast<CUDABackendNode>();
+ CV_Assert(!cudaNode.empty());
+
+ cudaNode->forward(ld.inputBlobsWrappers, ld.outputBlobsWrappers, cudaInfo->workspace);
+#endif
+ }
+ else if (preferableBackend == DNN_BACKEND_HALIDE)
{
forwardHalide(ld.outputBlobsWrappers, node);
}
//forward itself
forwardLayer(ld);
+
+#ifdef HAVE_CUDA
+ if (preferableBackend == DNN_BACKEND_CUDA)
+ cudaInfo->context.stream.synchronize();
+#endif
}
void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
prevNode = itBackend->second;
}
}
- String colors[] = {"#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462"};
+ String colors[] = {"#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848"};
String backend;
switch (prefBackend) {
case DNN_BACKEND_DEFAULT: backend = "DEFAULT/"; break;
case DNN_BACKEND_HALIDE: backend = "HALIDE/"; break;
case DNN_BACKEND_INFERENCE_ENGINE: backend = "DLIE/"; break;
case DNN_BACKEND_OPENCV: backend = "OCV/"; break;
+ case DNN_BACKEND_CUDA: backend = "CUDA/"; break;
}
out << "digraph G {" << '\n';
// Add nodes
case DNN_TARGET_OPENCL_FP16: out << "OCL_FP16\\n"; colorId = 2; break;
case DNN_TARGET_MYRIAD: out << "MYRIAD\\n"; colorId = 3; break;
case DNN_TARGET_FPGA: out << "FPGA\\n"; colorId = 4; break;
+ case DNN_TARGET_CUDA: out << "CUDA\\n"; colorId = 5; break;
+ case DNN_TARGET_CUDA_FP16: out << "CUDA_FP16\\n"; colorId = 6; break;
}
out << ((skipId.size() == 1)? "\" " : " }\" ");
out << "fillcolor=\"" << colors[colorId] << "\" ";
return backendId == DNN_BACKEND_OPENCV;
}
+Ptr<BackendNode> Layer::initCUDA(
+ void*,
+ const std::vector<Ptr<BackendWrapper>>&,
+ const std::vector<Ptr<BackendWrapper>>&)
+{
+ CV_Error(Error::StsNotImplemented, "CUDA pipeline of " + type +
+ " layers is not defined.");
+ return Ptr<BackendNode>();
+}
+
Ptr<BackendNode> Layer::initVkCom(const std::vector<Ptr<BackendWrapper> > &)
{
CV_Error(Error::StsNotImplemented, "VkCom pipeline of " + type +
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include "opencl_kernels_dnn.hpp"
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/batch_norm.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return (backendId == DNN_BACKEND_OPENCV) ||
+ backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_HALIDE && haveHalide()) ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && (preferableTarget == DNN_TARGET_CPU || dims == 4));
}
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+ return make_cuda_node<cuda4dnn::BatchNormOp>(preferableTarget, std::move(context->stream), weights_, bias_);
+ }
+#endif
+
virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
{
switch (node->backendId)
//
//M*/
#include "../precomp.hpp"
+#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/reshape.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine());
}
inputs[i].copyTo(outputs[i]);
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+ return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
+ }
+#endif
+
#ifdef HAVE_INF_ENGINE
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
{
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../op_vkcom.hpp"
#include "opencl_kernels_dnn.hpp"
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/concat.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 && !padding) || // By channels
(backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && !padding) ||
(backendId == DNN_BACKEND_VKCOM && haveVulkan() && !padding);
}
}
}
+
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
+ auto concat_axis = clamp(axis, input_wrapper->getRank());
+ return make_cuda_node<cuda4dnn::ConcatOp>(preferableTarget, std::move(context->stream), concat_axis, padding);
+ }
+#endif
+
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
{
#ifdef HAVE_VULKAN
#include "../precomp.hpp"
#include "../op_inf_engine.hpp"
+#include "../op_cuda.hpp"
#include "layers_common.hpp"
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/const.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv { namespace dnn {
class ConstLayerImpl CV_FINAL : public ConstLayer
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
- return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE;
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_INFERENCE_ENGINE ||
+ backendId == DNN_BACKEND_CUDA;
}
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif // HAVE_INF_ENGINE
+
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ CV_Assert(blobs.size() == 1);
+ return make_cuda_node<cuda4dnn::ConstOp>(preferableTarget, std::move(context->stream), blobs[0]);
+ }
+#endif
+
};
Ptr<Layer> ConstLayer::create(const LayerParams& params)
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../op_vkcom.hpp"
using namespace cv::dnn::ocl4dnn;
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/convolution.hpp"
+#include "../cuda4dnn/primitives/transpose_convolution.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
+ if (backendId == DNN_BACKEND_CUDA)
+ {
+ /* only 2d and 3d convolutions are supported */
+ return kernel_size.size() == 2 || kernel_size.size() == 3;
+ }
+
#ifdef HAVE_INF_ENGINE
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
{
return Ptr<BackendNode>();
}
-
-
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
kernel_size, strides, pads_begin, pads_end, dilations, activ.get(), ngroups, nstripes);
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ CV_Assert(inputs.size() == 1);
+ auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
+ auto input_shape = input_wrapper->getShape();
+
+ CV_Assert(outputs.size() == 1);
+ auto output_wrapper = outputs[0].dynamicCast<CUDABackendWrapper>();
+ auto output_shape = output_wrapper->getShape();
+
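+ /* filters are stored as [O, I/G, ...]; the group count follows from the input channel count */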
+ const auto output_feature_maps = blobs[0].size[0];
+ const auto input_feature_maps = input_shape[1];
+ const auto input_feature_maps_per_group = blobs[0].size[1];
+ const auto groups = input_feature_maps / input_feature_maps_per_group;
+
+ ConvolutionConfiguration config;
+ config.kernel_size.assign(std::begin(kernel_size), std::end(kernel_size));
+ config.dilations.assign(std::begin(dilations), std::end(dilations));
+ config.strides.assign(std::begin(strides), std::end(strides));
+
+ if (padMode.empty())
+ {
+ config.padMode = ConvolutionConfiguration::PaddingMode::MANUAL;
+ config.pads_begin.assign(std::begin(pads_begin), std::end(pads_begin));
+ config.pads_end.assign(std::begin(pads_end), std::end(pads_end));
+ }
+ else if (padMode == "VALID")
+ {
+ config.padMode = ConvolutionConfiguration::PaddingMode::VALID;
+ }
+ else if (padMode == "SAME")
+ {
+ config.padMode = ConvolutionConfiguration::PaddingMode::SAME;
+ }
+ else
+ {
+ CV_Error(Error::StsNotImplemented, padMode + " padding mode not supported by ConvolutionLayer");
+ }
+
+ config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
+ config.output_shape.assign(std::begin(output_shape), std::end(output_shape));
+ config.groups = groups;
+
+ Mat filtersMat = fusedWeights ? weightsMat : blobs[0];
+ Mat biasMat = (hasBias() || fusedBias) ? Mat(output_feature_maps, 1, CV_32F, biasvec.data()) : Mat();
+ if (!biasMat.empty() && countNonZero(biasMat) == 0)
+ biasMat = Mat();
+
+ return make_cuda_node<cuda4dnn::ConvolutionOp>(
+ preferableTarget, std::move(context->stream), std::move(context->cudnn_handle), config, filtersMat, biasMat);
+ }
+#endif
+
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
+ if (backendId == DNN_BACKEND_CUDA)
+ {
+ /* only 2d and 3d deconvolutions are supported */
+ return kernel_size.size() == 2 || kernel_size.size() == 3;
+ }
+
#ifdef HAVE_INF_ENGINE
const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW or IODHW layout
const int group = numOutput / outGroupCn;
}
else
#endif // HAVE_INF_ENGINE
- return kernel_size.size() == 2 && (backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE);
+ return backendId == DNN_BACKEND_CUDA ||
+ (kernel_size.size() == 2 && (backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE));
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ CV_Assert(inputs.size() == 1);
+ auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
+ auto input_shape = input_wrapper->getShape();
+
+ CV_Assert(outputs.size() == 1);
+ auto output_wrapper = outputs[0].dynamicCast<CUDABackendWrapper>();
+ auto output_shape = output_wrapper->getShape();
+
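+ /* deconvolution weights are in [I, O/G, ...] layout; the group count follows from the output channel count */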
+ const auto output_feature_maps = numOutput;
+ const auto output_feature_maps_per_group = blobs[0].size[1];
+ const auto groups = output_feature_maps / output_feature_maps_per_group;
+
+ TransposeConvolutionConfiguration config;
+ config.kernel_size.assign(std::begin(kernel_size), std::end(kernel_size));
+ config.dilations.assign(std::begin(dilations), std::end(dilations));
+ config.strides.assign(std::begin(strides), std::end(strides));
+
+ if (padMode.empty())
+ {
+ config.padMode = TransposeConvolutionConfiguration::PaddingMode::MANUAL;
+ config.pads_begin.assign(std::begin(pads_begin), std::end(pads_begin));
+ config.pads_end.assign(std::begin(pads_end), std::end(pads_end));
+ }
+ else if (padMode == "VALID")
+ {
+ config.padMode = TransposeConvolutionConfiguration::PaddingMode::VALID;
+ }
+ else if (padMode == "SAME")
+ {
+ config.padMode = TransposeConvolutionConfiguration::PaddingMode::SAME;
+ }
+ else
+ {
+ CV_Error(Error::StsNotImplemented, padMode + " padding mode not supported by DeconvolutionLayer");
+ }
+
+ config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
+ config.output_shape.assign(std::begin(output_shape), std::end(output_shape));
+ config.groups = groups;
+
+ CV_Assert(blobs.size() >= 1);
+ Mat filtersMat = fusedWeights ? weightsMat.t() : blobs[0];
+
+ Mat biasMat = (hasBias() || fusedBias) ? biasesMat : Mat();
+ if (!biasMat.empty() && countNonZero(biasMat) == 0)
+ biasMat = Mat();
+
+ return make_cuda_node<cuda4dnn::TransposeConvolutionOp>(
+ preferableTarget, std::move(context->stream), std::move(context->cudnn_handle), config, filtersMat, biasMat);
+ }
+#endif
+
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../op_vkcom.hpp"
#include "opencl_kernels_dnn.hpp"
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/activation.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
func.apply(src, dst, len, planeSize, cn0, cn1);
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+ return func.initCUDA(Layer::preferableTarget, context->stream);
+ }
+#endif
+
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
return slope >= 0 || !INF_ENGINE_VER_MAJOR_EQ(INF_ENGINE_RELEASE_2019R1);
#endif
- return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
+ backendId == DNN_BACKEND_HALIDE ||
backendId == DNN_BACKEND_VKCOM;
}
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+ {
+ return make_cuda_node<cuda4dnn::ReLUOp>(target, stream, slope);
+ }
+#endif
+
#ifdef HAVE_OPENCL
bool initKernel(ocl::Kernel &ker, const UMat &src) const
{
}
#endif // HAVE_VULKAN
-
-
bool tryFuse(Ptr<dnn::Layer>&) { return false; }
void getScaleShift(Mat&, Mat&) const {}
bool supportBackend(int backendId, int)
{
- return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
+ backendId == DNN_BACKEND_HALIDE ||
backendId == DNN_BACKEND_INFERENCE_ENGINE;
}
}
#endif
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+ {
+ return make_cuda_node<cuda4dnn::ClippedReLUOp>(target, stream, minValue, maxValue);
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
bool supportBackend(int backendId, int)
{
- return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
+ backendId == DNN_BACKEND_HALIDE ||
backendId == DNN_BACKEND_INFERENCE_ENGINE;
}
}
#endif
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+ {
+ return make_cuda_node<cuda4dnn::TanHOp>(target, stream);
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
bool supportBackend(int backendId, int)
{
- return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
+ backendId == DNN_BACKEND_HALIDE ||
backendId == DNN_BACKEND_INFERENCE_ENGINE;
}
}
#endif
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+ {
+ return make_cuda_node<cuda4dnn::SigmoidOp>(target, stream);
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
bool supportBackend(int backendId, int)
{
- return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
+ backendId == DNN_BACKEND_HALIDE ||
backendId == DNN_BACKEND_INFERENCE_ENGINE;
}
}
#endif
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+ {
+ return make_cuda_node<cuda4dnn::ELUOp>(target, stream);
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
return !INF_ENGINE_VER_MAJOR_EQ(INF_ENGINE_RELEASE_2019R1);
#endif
- return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
+ backendId == DNN_BACKEND_HALIDE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
}
#endif
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+ {
+ return make_cuda_node<cuda4dnn::AbsValOp>(target, stream);
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
bool supportBackend(int backendId, int)
{
- return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
+ backendId == DNN_BACKEND_HALIDE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
}
#endif
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+ {
+ return make_cuda_node<cuda4dnn::BNLLOp>(target, stream);
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
return (targetId != DNN_TARGET_OPENCL && targetId != DNN_TARGET_OPENCL_FP16) || power == 1.0 || power == 0.5;
else
- return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
+ backendId == DNN_BACKEND_HALIDE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
}
#endif
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+ {
+ return make_cuda_node<cuda4dnn::PowerOp>(target, stream, power, scale, shift);
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
bool supportBackend(int backendId, int)
{
- return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
+ backendId == DNN_BACKEND_HALIDE ||
backendId == DNN_BACKEND_INFERENCE_ENGINE;
}
}
#endif
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+ {
+ return make_cuda_node<cuda4dnn::ChannelwiseReLUOp>(target, stream, scale);
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "opencl_kernels_dnn.hpp"
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/eltwise.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
backendId == DNN_BACKEND_HALIDE ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE &&
(preferableTarget != DNN_TARGET_OPENCL || coeffs.empty()));
coeffs, op, activ.get(), nstripes);
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ auto op_ = [this] {
+ switch (op) {
+ case MAX: return cuda4dnn::EltwiseOpType::MAX;
+ case SUM: return cuda4dnn::EltwiseOpType::SUM;
+ case PROD: return cuda4dnn::EltwiseOpType::PRODUCT;
+ default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
+ }
+ return cuda4dnn::EltwiseOpType::SUM; /* unreachable */
+ }();
+
+ return make_cuda_node<cuda4dnn::EltwiseOp>(preferableTarget, std::move(context->stream), op_, coeffs);
+ }
+#endif
+
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include <float.h>
#include <algorithm>
#include <opencv2/dnn/shape_utils.hpp>
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/reshape.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine());
}
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+ return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
+ }
+#endif
+
#ifdef HAVE_INF_ENGINE
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
{
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include <opencv2/dnn/shape_utils.hpp>
using namespace cv::dnn::ocl4dnn;
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/inner_product.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1) ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && axis == 1);
}
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
+
+ auto flatten_start_axis = clamp(axis, input_wrapper->getRank());
+
+ auto biasMat_ = bias ? biasMat : Mat();
+ return make_cuda_node<cuda4dnn::InnerProductOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), flatten_start_axis, weightsMat, biasMat_);
+ }
+#endif
+
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../op_vkcom.hpp"
using namespace cv::dnn::ocl4dnn;
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/lrn.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
return bias == (int)bias;
return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
backendId == DNN_BACKEND_HALIDE ||
(backendId == DNN_BACKEND_VKCOM && haveVulkan() && (size % 2 == 1) && (type == CHANNEL_NRM));
}
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ cuda4dnn::LRNType type_;
+ if (type == CHANNEL_NRM)
+ type_ = cuda4dnn::LRNType::ACROSS_CHANNELS;
+ else if (type == SPATIAL_NRM)
+ type_ = cuda4dnn::LRNType::WITHIN_CHANNEL;
+ else
+ CV_Error(Error::StsNotImplemented, "Unknown normalization region");
+
+ float alphaSize = alpha;
+ if (!normBySize) {
+ switch (type) {
+ case CHANNEL_NRM: alphaSize = alpha * size; break;
+ case SPATIAL_NRM: alphaSize = alpha * size * size; break;
+ }
+ }
+
+ std::size_t largestInputSize = 0;
+ for (const auto& wrapper : inputs) {
+ auto input_wrapper = wrapper.dynamicCast<CUDABackendWrapper>();
+ auto shape = input_wrapper->getShape();
+ largestInputSize = std::max<std::size_t>(
+ largestInputSize,
+ std::accumulate(std::begin(shape), std::end(shape), std::size_t(1), std::multiplies<std::size_t>())
+ );
+ }
+
+ return make_cuda_node<cuda4dnn::LRNOp>(preferableTarget,
+ std::move(context->cudnn_handle), type_, size, alphaSize, beta, bias, largestInputSize);
+ }
+#endif
+
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_VULKAN
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include <opencv2/dnn/shape_utils.hpp>
-#include <iostream>
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/max_unpooling.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
namespace cv
{
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_HALIDE && haveHalide() && !poolPad.width && !poolPad.height);
}
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ cuda4dnn::MaxUnpoolingConfiguration config;
+ auto& window_size = config.window_size;
+ window_size.resize(2);
+ window_size[0] = poolKernel.height;
+ window_size[1] = poolKernel.width;
+
+ auto& strides = config.strides;
+ strides.resize(2);
+ strides[0] = poolStride.height;
+ strides[1] = poolStride.width;
+
+ auto& pads_begin = config.pads_begin;
+ pads_begin.resize(2);
+ pads_begin[0] = poolPad.height;
+ pads_begin[1] = poolPad.width;
+
+ return make_cuda_node<cuda4dnn::MaxUnpoolingOp>(preferableTarget, std::move(context->stream), config);
+ }
+#endif
+
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/normalize_bbox.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv { namespace dnn {
class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer
return preferableTarget == DNN_TARGET_MYRIAD ? !acrossSpatial : startAxis == 1;
}
- return backendId == DNN_BACKEND_OPENCV;
+ return backendId == DNN_BACKEND_OPENCV ||
+ (backendId == DNN_BACKEND_CUDA && (pnorm == 1 || pnorm == 2));
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ if (pnorm != 1 && pnorm != 2)
+ CV_Error(Error::StsNotImplemented, "Unsupported normalization mode");
+
+ auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
+ auto input_shape = input_wrapper->getShape();
+
+ NormalizeConfiguration<float> config;
+ config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
+ config.axis_start = clamp(startAxis, input_shape.size());
+ config.axis_end = clamp(endAxis, input_shape.size()) + 1; /* +1 because NormalizeOp follows [start, end) convention */
+ config.norm = pnorm;
+ config.eps = epsilon;
+
+ const auto& weightsMat = blobs.empty() ? Mat() : blobs[0];
+ return make_cuda_node<cuda4dnn::NormalizeOp>(preferableTarget, std::move(context->stream), weightsMat, config);
+ }
+#endif
+
#ifdef HAVE_INF_ENGINE
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
{
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include <vector>
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/padding.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
(dstRanges.size() == 4 && paddings[0].first == 0 && paddings[0].second == 0));
#endif
return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_HALIDE && haveHalide() && dstRanges.size() == 4);
}
CV_Error(Error::StsNotImplemented, "Unknown padding type: " + paddingType);
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ cuda4dnn::PaddingType ptype;
+ if (paddingType == "constant")
+ ptype = PaddingType::CONSTANT;
+ else if (paddingType == "reflect")
+ ptype = PaddingType::REFLECTION101;
+ else
+ CV_Error(Error::StsNotImplemented, "Unsupported padding mode");
+
+ return make_cuda_node<cuda4dnn::PaddingOp>(preferableTarget, std::move(context->stream), ptype, paddingValue, dstRanges);
+ }
+#endif
+
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "../op_vkcom.hpp"
#include <float.h>
#include "opencl_kernels_dnn.hpp"
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/permute.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine()) ||
(backendId == DNN_BACKEND_VKCOM && haveVulkan());
}
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+ return make_cuda_node<cuda4dnn::PermuteOp>(preferableTarget, std::move(context->stream), _order);
+ }
+#endif
+
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
{
#ifdef HAVE_VULKAN
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
+#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../op_vkcom.hpp"
using namespace cv::dnn::ocl4dnn;
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/pooling.hpp"
+#include "../cuda4dnn/primitives/max_unpooling.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
- if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
+ if (backendId == DNN_BACKEND_CUDA)
+ {
+ return type == MAX || type == AVE;
+ }
+ else if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
{
if (computeMaxIdx)
return false;
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
+ auto input_shape = input_wrapper->getShape();
+
+ /* storing max indices is a special case and we deal with it separately */
+ if (computeMaxIdx) {
+ CV_Assert(type == MAX);
+
+ cuda4dnn::MaxPoolingConfiguration config;
+ config.window_size.assign(std::begin(kernel_size), std::end(kernel_size));
+ config.strides.assign(std::begin(strides), std::end(strides));
+
+ if (padMode.empty())
+ {
+ config.padMode = MaxPoolingConfiguration::PaddingMode::MANUAL;
+ config.pads_begin.assign(std::begin(pads_begin), std::end(pads_begin));
+ }
+ else if (padMode == "VALID")
+ {
+ config.padMode = MaxPoolingConfiguration::PaddingMode::VALID;
+ }
+ else if (padMode == "SAME")
+ {
+ config.padMode = MaxPoolingConfiguration::PaddingMode::SAME;
+ }
+ else
+ {
+ CV_Error(Error::StsNotImplemented, padMode + " padding mode not supported by PoolingLayer");
+ }
+
+ config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
+
+ return make_cuda_node<cuda4dnn::MaxPoolingOp>(preferableTarget, std::move(context->stream), config);
+ }
+
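+ /* the remaining max/average cases map directly to cuDNN pooling */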
+ PoolingConfiguration config;
+ if (type == MAX)
+ {
+ config.poolMode = PoolingConfiguration::PoolingMode::MAX;
+ }
+ else if (type == AVE && !avePoolPaddedArea)
+ {
+ config.poolMode = PoolingConfiguration::PoolingMode::AVERAGE_EXCLUDE_PADDING;
+ }
+ else if (type == AVE && avePoolPaddedArea)
+ {
+ config.poolMode = PoolingConfiguration::PoolingMode::AVERAGE_INCLUDE_PADDING;
+ }
+ else
+ {
+ CV_Error(Error::StsNotImplemented, "Unsupported pooling mode");
+ }
+
+ config.window_size.assign(std::begin(kernel_size), std::end(kernel_size));
+ config.strides.assign(std::begin(strides), std::end(strides));
+
+ if (padMode.empty())
+ {
+ config.padMode = PoolingConfiguration::PaddingMode::MANUAL;
+ config.pads_begin.assign(std::begin(pads_begin), std::end(pads_begin));
+ config.pads_end.assign(std::begin(pads_end), std::end(pads_end));
+ }
+ else if (padMode == "VALID")
+ {
+ config.padMode = PoolingConfiguration::PaddingMode::VALID;
+ }
+ else if (padMode == "SAME")
+ {
+ config.padMode = PoolingConfiguration::PaddingMode::SAME;
+ }
+ else
+ {
+ CV_Error(Error::StsNotImplemented, padMode + " padding mode not supported by PoolingLayer");
+ }
+
+ if (ceilMode)
+ config.roundMode = PoolingConfiguration::RoundingMode::CEIL;
+ else
+ config.roundMode = PoolingConfiguration::RoundingMode::FLOOR;
+
+ config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
+
+ return make_cuda_node<cuda4dnn::PoolingOp>(preferableTarget, std::move(context->cudnn_handle), config);
+ }
+#endif
+
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_VULKAN
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "../op_vkcom.hpp"
#include <float.h>
#include "opencl_kernels_dnn.hpp"
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/prior_box.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() &&
( _explicitSizes || (_minSize.size() == 1 && _maxSize.size() <= 1)))
|| (backendId == DNN_BACKEND_VKCOM && haveVulkan());
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ auto feature_map_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
+ auto feature_map_shape = feature_map_wrapper->getShape();
+
+ auto image_wrapper = inputs[1].dynamicCast<CUDABackendWrapper>();
+ auto image_shape = image_wrapper->getShape();
+
+ PriorBoxConfiguration config;
+ config.feature_map_width = feature_map_shape.rbegin()[0];
+ config.feature_map_height = feature_map_shape.rbegin()[1];
+ config.image_width = image_shape.rbegin()[0];
+ config.image_height = image_shape.rbegin()[1];
+
+ config.num_priors = _numPriors;
+ config.box_widths = _boxWidths;
+ config.box_heights = _boxHeights;
+ config.offsets_x = _offsetsX;
+ config.offsets_y = _offsetsY;
+ config.stepX = _stepX;
+ config.stepY = _stepY;
+
+ config.variance = _variance;
+
+ config.clip = _clip;
+ config.normalize = _bboxesNormalized;
+
+ return make_cuda_node<cuda4dnn::PriorBoxOp>(preferableTarget, std::move(context->stream), config);
+ }
+#endif
+
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
{
#ifdef HAVE_VULKAN
//M*/
#include "../precomp.hpp"
+#include "../op_cuda.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/dnn/all_layers.hpp>
#include "../nms.inl.hpp"
#include "opencl_kernels_dnn.hpp"
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/region.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented");
}
+ virtual bool supportBackend(int backendId) CV_OVERRIDE
+ {
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA;
+ }
+
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ if (coords != 4)
+ CV_Error(Error::StsNotImplemented, "Only upright rectangular boxes are supported in RegionLayer.");
+
+ std::size_t height_norm, width_norm;
+ if (inputs.size() == 1)
+ {
+ auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
+ auto input_shape = input_wrapper->getShape();
+ height_norm = input_shape[1];
+ width_norm = input_shape[2];
+ }
+ else
+ {
+ auto input_wrapper = inputs[1].dynamicCast<CUDABackendWrapper>();
+ auto input_shape = input_wrapper->getShape();
+ CV_Assert(input_shape.size() == 4);
+ height_norm = input_shape[2];
+ width_norm = input_shape[3];
+ }
+
+ /* exactly one squash method must be enabled */
+ CV_Assert((useLogistic || useSoftmax) && !(useLogistic && useSoftmax));
+
+ cuda4dnn::SquashMethod squash_method;
+ if (useLogistic)
+ squash_method = cuda4dnn::SquashMethod::SIGMOID;
+ else /* useSoftmax */
+ squash_method = cuda4dnn::SquashMethod::SOFTMAX;
+
+ cuda4dnn::RegionConfiguration<float> config;
+ config.squash_method = squash_method;
+ config.classes = classes;
+ config.boxes_per_cell = anchors;
+
+ config.height_norm = height_norm;
+ config.width_norm = width_norm;
+
+ config.object_prob_cutoff = (classfix == -1) ? 0.5 : 0.0;
+ config.class_prob_cutoff = thresh;
+
+ config.nms_iou_threshold = nmsThreshold;
+
+ return make_cuda_node<cuda4dnn::RegionOp>(preferableTarget, std::move(context->stream), blobs[0], config);
+ }
+#endif
+
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
//M*/
#include "../precomp.hpp"
+#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/dnn/all_layers.hpp>
#include "opencl_kernels_dnn.hpp"
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/reorg.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
- return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE;
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
+ backendId == DNN_BACKEND_INFERENCE_ENGINE;
}
#ifdef HAVE_OPENCL
permute->forward(inputs, outputs, internals_arr);
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+ return make_cuda_node<cuda4dnn::ReorgOp>(preferableTarget, std::move(context->stream), reorgStride);
+ }
+#endif
+
#ifdef HAVE_INF_ENGINE
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include <opencv2/dnn/shape_utils.hpp>
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/reshape.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine());
}
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+ return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
+ }
+#endif
+
#ifdef HAVE_INF_ENGINE
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
{
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include <opencv2/imgproc.hpp>
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/resize.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv { namespace dnn {
class ResizeLayerImpl : public ResizeLayer
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
+ if (backendId == DNN_BACKEND_CUDA)
+ return interpolation == "nearest" || interpolation == "bilinear";
+
#ifdef HAVE_INF_ENGINE
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
{
CV_Error(Error::StsNotImplemented, "Unknown interpolation: " + interpolation);
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ cuda4dnn::InterpolationType itype;
+ if (interpolation == "nearest")
+ itype = InterpolationType::NEAREST_NEIGHBOUR;
+ else if (interpolation == "bilinear")
+ itype = InterpolationType::BILINEAR;
+ else
+ CV_Error(Error::StsNotImplemented, "Requested interpolation mode is not available in resize layer.");
+
+ return make_cuda_node<cuda4dnn::ResizeOp>(preferableTarget, std::move(context->stream), itype, scaleHeight, scaleWidth);
+ }
+#endif
+
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
- return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE;
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_INFERENCE_ENGINE ||
+ backendId == DNN_BACKEND_CUDA;
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include <opencv2/dnn/shape_utils.hpp>
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/scale_shift.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
- return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
+ backendId == DNN_BACKEND_HALIDE ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE && axis == 1);
}
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ CV_Assert(!blobs.empty() || inputs.size() == 2);
+
+ cv::Mat weightsMat = hasWeights ? blobs[0] : Mat();
+
+ /* if the weights are provided, bias will be in blobs[1]; otherwise, it will be in blobs[0]
+ * in either case, it is at the end of the blobs vector => bias = blobs.back()
+ */
+ cv::Mat biasMat = hasBias ? blobs.back() : Mat();
+
+ return make_cuda_node<cuda4dnn::ScaleShiftOp>(preferableTarget, std::move(context->stream), axis, weightsMat, biasMat);
+ }
+#endif
+
virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
{
switch (node->backendId)
// Copyright (C) 2018, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
+#include "../op_cuda.hpp"
+
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/shuffle_channel.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
namespace cv { namespace dnn {
setParamsFrom(params);
}
+ virtual bool supportBackend(int backendId) CV_OVERRIDE
+ {
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA;
+ }
+
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+ return make_cuda_node<cuda4dnn::ShuffleChannelOp>(preferableTarget, std::move(context->stream), group);
+ }
+#endif
+
private:
Ptr<PermuteLayer> permute;
std::vector<int> permuteInpShape, permuteOutShape;
//M*/
#include "../precomp.hpp"
+#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "layers_common.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include "opencl_kernels_dnn.hpp"
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/slice.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE &&
#ifdef HAVE_INF_ENGINE
INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1) &&
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
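+ /* the slice kernel only needs the start offset along each axis; the extents are implied by the output shapes */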
+ std::vector<std::vector<std::size_t>> offsets;
+ for (const auto& ranges : sliceRanges)
+ {
+ std::vector<std::size_t> offsets_i;
+ for (const auto& range : ranges)
+ offsets_i.push_back(range.start);
+ offsets.push_back(std::move(offsets_i));
+ }
+
+ return make_cuda_node<cuda4dnn::SliceOp>(preferableTarget, std::move(context->stream), std::move(offsets));
+ }
+#endif
+
#ifdef HAVE_INF_ENGINE
#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1)
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
#include "../precomp.hpp"
#include "layers_common.hpp"
+#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../op_vkcom.hpp"
using namespace cv::dnn::ocl4dnn;
#endif
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/softmax.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_HALIDE && haveHalide() && axisRaw == 1) ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && !logSoftMax) ||
(backendId == DNN_BACKEND_VKCOM && haveVulkan());
}
}
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
+ auto channel_axis = clamp(axisRaw, input_wrapper->getRank());
+ return make_cuda_node<cuda4dnn::SoftmaxOp>(preferableTarget, std::move(context->cudnn_handle), channel_axis, logSoftMax);
+ }
+#endif
+
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_VULKAN
//M*/
#include "../precomp.hpp"
+#include "../op_cuda.hpp"
#include "layers_common.hpp"
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/split.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
namespace cv
{
namespace dnn
}
}
+ virtual bool supportBackend(int backendId) CV_OVERRIDE
+ {
+ return backendId == DNN_BACKEND_OPENCV ||
+ backendId == DNN_BACKEND_CUDA;
+ }
+
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
inputs[0].copyTo(outputs[i]);
}
}
+
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA(
+ void *context_,
+ const std::vector<Ptr<BackendWrapper>>& inputs,
+ const std::vector<Ptr<BackendWrapper>>& outputs
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+ return make_cuda_node<cuda4dnn::SplitOp>(preferableTarget, std::move(context->stream));
+ }
+#endif
+
};
Ptr<SplitLayer> SplitLayer::create(const LayerParams& params)
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_OP_CUDA_HPP
+#define OPENCV_DNN_SRC_OP_CUDA_HPP
+
+#ifdef HAVE_CUDA
+#include "cuda4dnn/csl/stream.hpp"
+#include "cuda4dnn/csl/cublas.hpp"
+#include "cuda4dnn/csl/cudnn.hpp"
+#include "cuda4dnn/csl/tensor.hpp"
+#include "cuda4dnn/csl/memory.hpp"
+#include "cuda4dnn/csl/fp16.hpp"
+#include "cuda4dnn/csl/workspace.hpp"
+#endif
+
+#include <opencv2/dnn/shape_utils.hpp>
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <memory>
+#include <iterator>
+
+namespace cv { namespace dnn {
+
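+ /** returns true if the target identifier refers to a CUDA target */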
+ constexpr bool IS_DNN_CUDA_TARGET(int id) {
+ return id == DNN_TARGET_CUDA_FP16 || id == DNN_TARGET_CUDA;
+ }
+
+ constexpr bool haveCUDA() {
+#ifdef HAVE_CUDA
+ return true;
+#else
+ return false;
+#endif
+ }
+
+#ifdef HAVE_CUDA
+ namespace cuda4dnn { namespace csl {
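+ /* CSLContext groups the per-network CUDA resources (stream plus cuBLAS and cuDNN handles) shared by all nodes */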
+ struct CSLContext {
+ Stream stream;
+ cublas::Handle cublas_handle;
+ cudnn::Handle cudnn_handle;
+ };
+
+ /** @brief creates Tensor object from cv::Mat (only the header is created, i.e. no data is copied)
+ *
+ * \tparam T element type for the tensor
+ * \param[in] mat cv::Mat from which the shape must be inferred
+ *
+ * \return a Tensor object with the shape of \p mat
+ */
+ template <class T>
+ Tensor<T> makeTensorHeader(const Mat& mat) {
+ auto sizes = shape(mat);
+ return Tensor<T>(std::begin(sizes), std::end(sizes));
+ }
+
+ /** @brief copies data from a cv::Mat to TensorType
+ *
+ * \tparam T the type of the elements contained in TensorType object
+ *
+ * \param[in] srcMat source matrix
+ * \param[out] destTensor destination tensor
+ * \param stream CUDA stream to use for the memory transfer
+ *
+ * The memory copy starts from the beginning of \p srcMat. The number of elements copied is
+ * equal to the number of elements in \p destTensor.
+ *
+ * Pre-conditions:
+ * - \p srcMat must contain elements of type CV_32F
+ * - the size of \p srcMat must be larger than or equal to the size of \p destTensor
+ *
+ * @note best performance when \p srcMat is continuous and page-locked
+ * @note blocks calling thread if \p srcMat is not page-locked
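+ *
+ * Illustrative usage (assumes a Tensor converts to a TensorSpan):
+ * auto tensor = makeTensorHeader<float>(mat);
+ * copyMatToTensor<float>(mat, tensor, stream);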
+ */
+ template <class T>
+ void copyMatToTensor(const Mat& srcMat, const TensorSpan<T> destTensor, const Stream& stream);
+
+ template <> inline
+ void copyMatToTensor(const Mat& srcMat, const TensorSpan<half> destTensor, const Stream& stream) {
+ /* TODO: convert cv::Mats of other depths to the required type before copying */
+ CV_Assert(srcMat.type() == CV_32F);
+ CV_Assert(srcMat.total() >= destTensor.size());
+
+ Mat temp;
+ srcMat.convertTo(temp, CV_16F);
+ CV_Assert(temp.isContinuous());
+
+ memcpy<half>(destTensor.get(), reinterpret_cast<half*>(temp.data), destTensor.size(), stream);
+ }
+
+ template <> inline
+ void copyMatToTensor(const Mat& srcMat, const TensorSpan<float> destTensor, const Stream& stream) {
+ /* TODO: convert cv::Mats of other depths to the required type before copying */
+ CV_Assert(srcMat.type() == CV_32F);
+ CV_Assert(srcMat.total() >= destTensor.size());
+
+ Mat temp = srcMat.isContinuous() ? srcMat : srcMat.clone();
+ CV_Assert(temp.isContinuous());
+
+ memcpy<float>(destTensor.get(), reinterpret_cast<float*>(temp.data), destTensor.size(), stream);
+ }
+
+ /** @brief copies data from a TensorType to a cv::Mat
+ *
+ * \tparam T the type of the elements contained in TensorType object
+ *
+ * \param[in] srcTensor source tensor
+ * \param[out] destMat destination matrix
+ * \param stream CUDA stream to use for the memory transfer
+ *
+ * The entire memory block held by the \p srcTensor is copied to \p destMat.
+ *
+ * Pre-conditions:
+ * - \p destMat must contain elements of type CV_32F
+ * - the size of \p destMat must be larger than or equal to the size of \p srcTensor
+ *
+ * @note best performance when \p destMat is continuous and page-locked
+ * @note blocks calling thread if \p destMat is not page-locked
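+ *
+ * Illustrative usage:
+ * copyTensorToMat<float>(tensor, mat, stream);
+ * stream.synchronize(); // wait for the copy before reading mat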
+ */
+ template <class T>
+ void copyTensorToMat(TensorView<T> srcTensor, Mat& destMat, const Stream& stream);
+
+ template <> inline
+ void copyTensorToMat(TensorView<half> srcTensor, Mat& destMat, const Stream& stream) {
+ CV_Assert(destMat.type() == CV_32F);
+ CV_Assert(destMat.total() >= srcTensor.size());
+
+ Mat temp(shape(destMat), CV_16F);
+ CV_Assert(temp.isContinuous());
+
+ memcpy<half>(reinterpret_cast<half*>(temp.data), srcTensor.get(), srcTensor.size(), stream);
+
+ temp.convertTo(destMat, CV_32F);
+ }
+
+ template <> inline
+ void copyTensorToMat(TensorView<float> srcTensor, Mat& destMat, const Stream& stream) {
+ CV_Assert(destMat.type() == CV_32F);
+ CV_Assert(destMat.total() >= srcTensor.size());
+
+ Mat temp = destMat.isContinuous() ? destMat : destMat.clone();
+ CV_Assert(temp.isContinuous());
+
+ memcpy<float>(reinterpret_cast<float*>(temp.data), srcTensor.get(), srcTensor.size(), stream);
+
+ if (temp.data != destMat.data)
+ temp.copyTo(destMat);
+ }
+
+ }} /* namespace cuda4dnn::csl */
+
+ /** base class for CUDA operation nodes (for all supported targets) */
+ class CUDABackendNode : public BackendNode {
+ public:
+ CUDABackendNode() : BackendNode(DNN_BACKEND_CUDA) { }
+ virtual ~CUDABackendNode() { }
+
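+ /** performs the layer computation on the device using the given input/output wrappers and scratch \p workspace */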
+ virtual void forward(
+ const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+ const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+ cuda4dnn::csl::Workspace& workspace) = 0;
+
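+ /** returns the number of bytes of scratch memory the node requires from the shared workspace */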
+ virtual std::size_t get_workspace_memory_in_bytes() const noexcept { return 0; }
+ };
+
+ /** @brief utility function which creates CUDA node of correct type from `targetId`
+ *
+ * CUDA operation nodes take the type of data they operate on as a template parameter.
+ * For example, ConcatOp<float> is an operation node which concats tensors of `float` type
+ * into a tensor of `float` type.
+ *
+ * This utility function aids the creation of nodes of different types and eliminates the
+ * need for CUDA target constants (`DNN_TARGET_XXX`) to appear in the operation code which
+ * reduces coupling between modules.
+ *
+ * Example:
+ * template <class T>
+ * class ConcatOp : public CUDABackendNode;
+ *
+ * // returns a cv::Ptr to a ConcatOp<half> object
+ * auto node = make_cuda_node<ConcatOp>(DNN_TARGET_CUDA_FP16, axis);
+ *
+ * // returns a cv::Ptr to a ConcatOp<float> object
+ * auto node = make_cuda_node<ConcatOp>(DNN_TARGET_CUDA, axis);
+ */
+ template <template <class> class NodeType, class ...Args>
+ cv::Ptr<BackendNode> make_cuda_node(int targetId, Args&& ...args) {
+ switch (targetId)
+ {
+ case DNN_TARGET_CUDA_FP16:
+ return Ptr<BackendNode>(new NodeType<half>(std::forward<Args>(args)...));
+ case DNN_TARGET_CUDA:
+ return Ptr<BackendNode>(new NodeType<float>(std::forward<Args>(args)...));
+ default:
+ CV_Assert(IS_DNN_CUDA_TARGET(targetId));
+ }
+ return Ptr<BackendNode>();
+ }
+
+ /* base class for all CUDA backend/target wrappers */
+ class CUDABackendWrapper : public BackendWrapper {
+ public:
+ CUDABackendWrapper(int targetId) : BackendWrapper(DNN_BACKEND_CUDA, targetId) { }
+ virtual ~CUDABackendWrapper() { }
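+
+ /* host and device copies are synchronized lazily; dirty flags record which side holds the latest data */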
+
+ void copyToHost() override = 0;
+ void setHostDirty() override = 0;
+
+ virtual void copyToDevice() = 0;
+ virtual void setDeviceDirty() = 0;
+
+ virtual MatShape getShape() const noexcept = 0;
+ virtual std::size_t getRank() const noexcept = 0;
+
+ /** @note setting the stream updates the stream for all wrappers which use the same tensor */
+ virtual void setStream(cuda4dnn::csl::Stream stream) noexcept = 0;
+ };
+
+ template <class T, int TargetID>
+ class GenericCUDABackendWrapper final : public CUDABackendWrapper {
+ public:
+ using value_type = T;
+ using tensor_span_type = cuda4dnn::csl::TensorSpan<value_type>;
+ using tensor_view_type = cuda4dnn::csl::TensorView<value_type>;
+
+ /* Pre-conditions:
+ * - there must be no other instance of `GenericCUDABackendWrapper` which wraps the host memory used by `m`
+ * - the host memory must remain allocated throughout the lifetime of this object
+ *
+ * Post-conditions:
+ * - the host memory used by \p m "may" be page-locked
+ */
+ GenericCUDABackendWrapper(Mat& m)
+ : CUDABackendWrapper(TargetID)
+ {
+ shape = cv::dnn::shape(m);
+
+ shared_block = std::make_shared<shared_block_type>();
+ shared_block->host_dirty = true;
+ shared_block->device_dirty = false;
+
+ shared_block->host = m;
+
+ try {
+ shared_block->memGuard = cuda4dnn::csl::MemoryLockGuard(m.data, m.total() * m.elemSize());
+ } catch (...) {
+ /* a common reason for failure is that the host system (for example, a Jetson device) does not support it */
+ /* we ignore the failure as this is just an optimization and not a requirement */
+ }
+
+ shared_block->device = cuda4dnn::csl::ManagedPtr<T>(m.total());
+ }
+
+ GenericCUDABackendWrapper(const Ptr<BackendWrapper>& base_, const MatShape& shape_)
+ : CUDABackendWrapper(TargetID)
+ {
+ const Ptr<GenericCUDABackendWrapper> base = base_.dynamicCast<GenericCUDABackendWrapper>();
+ CV_Assert(base);
+
+ shape = shape_;
+ shared_block = base->shared_block;
+ }
+
+ static Ptr<BackendWrapper> create(Mat& m) {
+ return Ptr<BackendWrapper>(new GenericCUDABackendWrapper(m));
+ }
+
+ static Ptr<BackendWrapper> create(const Ptr<BackendWrapper>& base, const MatShape& shape) {
+ return Ptr<BackendWrapper>(new GenericCUDABackendWrapper(base, shape));
+ }
+
+ void copyToHost() override {
+ if (shared_block->device_dirty) {
+ shared_block->host_dirty = false;
+ shared_block->device_dirty = false;
+
+ /* If the wrapper is being reused, the device tensor might be larger in size than the wrapper.
+ * Using the device tensor does not give incorrect code but leads to unused region of memory being copied.
+ *
+ * We use a view to ensure that only the required region of memory is copied.
+ */
+ auto view = tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+ cuda4dnn::csl::copyTensorToMat<T>(view, shared_block->host, shared_block->stream);
+
+ shared_block->stream.synchronize();
+ }
+ }
+
+ void setHostDirty() override {
+ shared_block->device_dirty = false;
+ shared_block->host_dirty = true;
+ }
+
+ void copyToDevice() override {
+ if (shared_block->host_dirty) {
+ shared_block->host_dirty = false;
+ shared_block->device_dirty = false;
+
+ auto span = tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+ cuda4dnn::csl::copyMatToTensor<T>(shared_block->host, span, shared_block->stream);
+ }
+ }
+
+ void setDeviceDirty() override {
+ shared_block->device_dirty = true;
+ shared_block->host_dirty = false;
+ }
+
+ MatShape getShape() const noexcept override { return shape; }
+
+ std::size_t getRank() const noexcept override { return shape.size(); }
+
+ void setStream(cuda4dnn::csl::Stream stream) noexcept override {
+ shared_block->stream = std::move(stream);
+ }
+
+ cv::Mat getMutableHostMat() noexcept {
+ copyToHost();
+ setHostDirty();
+ return shared_block->host;
+ }
+
+ const cv::Mat getImmutableHostMat() const noexcept {
+ /* logically const: copyToHost only refreshes the cached host copy */
+ const_cast<GenericCUDABackendWrapper*>(this)->copyToHost();
+ return shared_block->host;
+ }
+
+ /* Optimization Note: use getSpan() and getView() judiciously
+ *
+ * getSpan() is meant to be used when the memory is going to be modified
+ * getView() is meant to be used when the memory is only going to be read
+ *
+ * getSpan() marks the device memory as dirty but getView() does not
+ *
+ * getView() implicitly performs host to device memory transfer if required
+ * getSpan() does not perform any synchronization (use copyToDevice if sync. is required)
+ */
+ tensor_span_type getSpan() noexcept {
+ setDeviceDirty();
+ return tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+ }
+
+ tensor_view_type getView() noexcept {
+ copyToDevice();
+ return tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+ }
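+
+ /* A minimal usage sketch for the note above (hypothetical layer code; the
+ * `node` object and the wrapper casts are illustrative, not part of this patch):
+ *
+ * auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapperFP32>();
+ * auto output_wrapper = outputs[0].dynamicCast<CUDABackendWrapperFP32>();
+ *
+ * auto input = input_wrapper->getView(); // read-only; syncs host to device if required
+ * auto output = output_wrapper->getSpan(); // will be written to; marks device memory dirty
+ * node->forward(input, output);
+ */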
+
+ private:
+ /* The same tensor memory can be reused by different layers whenever possible.
+ * Hence, it is possible for different backend wrappers to point to the same memory.
+ * However, each wrapper may use only a part of that memory and present a different shape.
+ *
+ * We store the common information, such as the device tensor and its corresponding host memory,
+ * in a shared block. The shared block is shared by all backend wrappers which use the same memory.
+ * The shape, which can differ across wrappers, is stored as a member object.
+ */
+
+ MatShape shape;
+
+ struct shared_block_type {
+ bool host_dirty;
+ bool device_dirty;
+
+ cv::Mat host;
+ cuda4dnn::csl::MemoryLockGuard memGuard; /* keeps host memory page-locked if possible */
+
+ cuda4dnn::csl::ManagedPtr<T> device;
+ cuda4dnn::csl::Stream stream;
+ };
+
+ std::shared_ptr<shared_block_type> shared_block;
+ };
+
+ using CUDABackendWrapperFP16 = GenericCUDABackendWrapper<half, DNN_TARGET_CUDA_FP16>;
+ using CUDABackendWrapperFP32 = GenericCUDABackendWrapper<float, DNN_TARGET_CUDA>;
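+
+ /* Sketch of how reshaping shares one allocation (shapes chosen arbitrarily for
+ * illustration): `create(base, shape)` aliases the device memory and host Mat of
+ * `base`; only the per-wrapper shape differs.
+ *
+ * cv::Mat blob({1, 64, 56, 56}, CV_32F);
+ * auto a = CUDABackendWrapperFP32::create(blob);
+ * auto b = CUDABackendWrapperFP32::create(a, MatShape{1, 64, 56 * 56});
+ * // `a` and `b` share one shared_block; dirty flags and copies affect both
+ */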
+
+ template <class T> struct GetCUDABackendWrapperType_ { };
+ template <> struct GetCUDABackendWrapperType_<half> { typedef CUDABackendWrapperFP16 type; };
+ template <> struct GetCUDABackendWrapperType_<float> { typedef CUDABackendWrapperFP32 type; };
+
+ template <class T>
+ using GetCUDABackendWrapperType = typename GetCUDABackendWrapperType_<T>::type;
+
+#endif
+}} /* namespace cv::dnn */
+
+#endif /* OPENCV_DNN_SRC_OP_CUDA_HPP */
target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_alexnet.yml" :
"dnn/halide_scheduler_alexnet.yml");
expectNoFallbacksFromIE(net);
+ expectNoFallbacksFromCUDA(net);
}
TEST_P(DNNTestNetwork, ResNet_50)
target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_resnet_50.yml" :
"dnn/halide_scheduler_resnet_50.yml");
expectNoFallbacksFromIE(net);
+ expectNoFallbacksFromCUDA(net);
}
TEST_P(DNNTestNetwork, SqueezeNet_v1_1)
target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_squeezenet_v1_1.yml" :
"dnn/halide_scheduler_squeezenet_v1_1.yml");
expectNoFallbacksFromIE(net);
+ expectNoFallbacksFromCUDA(net);
}
TEST_P(DNNTestNetwork, GoogLeNet)
processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt",
Size(224, 224), "prob");
expectNoFallbacksFromIE(net);
+ expectNoFallbacksFromCUDA(net);
}
TEST_P(DNNTestNetwork, Inception_5h)
"dnn/halide_scheduler_inception_5h.yml",
l1, lInf);
expectNoFallbacksFromIE(net);
+ expectNoFallbacksFromCUDA(net);
}
TEST_P(DNNTestNetwork, ENet)
target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_enet.yml" :
"dnn/halide_scheduler_enet.yml",
2e-5, 0.15);
+ expectNoFallbacksFromCUDA(net);
}
TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
processNet("dnn/openpose_pose_coco.caffemodel", "dnn/openpose_pose_coco.prototxt",
Size(46, 46), "", "", l1, lInf);
expectNoFallbacksFromIE(net);
+ expectNoFallbacksFromCUDA(net);
}
TEST_P(DNNTestNetwork, OpenPose_pose_mpi)
processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi.prototxt",
Size(46, 46), "", "", l1, lInf);
expectNoFallbacksFromIE(net);
+ expectNoFallbacksFromCUDA(net);
}
TEST_P(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages)
processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi_faster_4_stages.prototxt",
Size(46, 46));
expectNoFallbacksFromIE(net);
+ expectNoFallbacksFromCUDA(net);
}
TEST_P(DNNTestNetwork, OpenFace)
const float l1 = (target == DNN_TARGET_MYRIAD) ? 0.0024 : 0.0;
const float lInf = (target == DNN_TARGET_MYRIAD) ? 0.0071 : 0.0;
processNet("dnn/openface_nn4.small2.v1.t7", "", Size(96, 96), "", "", l1, lInf);
+
+ expectNoFallbacksFromCUDA(net);
}
TEST_P(DNNTestNetwork, opencv_face_detector)
processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Size(224, 224), "", "", l1, lInf);
if (target != DNN_TARGET_MYRIAD || getInferenceEngineVPUType() != CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
expectNoFallbacksFromIE(net);
+ expectNoFallbacksFromCUDA(net);
}
TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
#if defined(HAVE_INF_ENGINE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
expectNoFallbacksFromIE(net);
#endif
+ expectNoFallbacksFromCUDA(net);
}
-INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets(true, true, false, true));
+INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets(true, true, false, true, true));
}} // namespace
#define CV_TEST_TAG_DNN_SKIP_VULKAN "dnn_skip_vulkan"
+#define CV_TEST_TAG_DNN_SKIP_CUDA "dnn_skip_cuda"
+#define CV_TEST_TAG_DNN_SKIP_CUDA_FP16 "dnn_skip_cuda_fp16"
+#define CV_TEST_TAG_DNN_SKIP_CUDA_FP32 "dnn_skip_cuda_fp32"
namespace cv { namespace dnn {
CV__DNN_INLINE_NS_BEGIN
bool withInferenceEngine = true,
bool withHalide = false,
bool withCpuOCV = true,
- bool withVkCom = true
+ bool withVkCom = true,
+ bool withCUDA = true
);
static void getDefaultThresholds(int backend, int target, double* l1, double* lInf)
{
- if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+ if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
{
*l1 = 4e-3;
*lInf = 2e-2;
expectNoFallbacks(net);
}
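+ /* asserts that no layer of `net` falls back to another backend when CUDA is selected */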
+ void expectNoFallbacksFromCUDA(Net& net)
+ {
+ if (backend == DNN_BACKEND_CUDA)
+ expectNoFallbacks(net);
+ }
+
protected:
void checkBackend(Mat* inp = 0, Mat* ref = 0)
{
case DNN_BACKEND_INFERENCE_ENGINE: *os << "DLIE"; return;
case DNN_BACKEND_VKCOM: *os << "VKCOM"; return;
case DNN_BACKEND_OPENCV: *os << "OCV"; return;
+ case DNN_BACKEND_CUDA: *os << "CUDA"; return;
} // don't use "default:" to emit compiler warnings
*os << "DNN_BACKEND_UNKNOWN(" << (int)v << ")";
}
case DNN_TARGET_MYRIAD: *os << "MYRIAD"; return;
case DNN_TARGET_VULKAN: *os << "VULKAN"; return;
case DNN_TARGET_FPGA: *os << "FPGA"; return;
+ case DNN_TARGET_CUDA: *os << "CUDA"; return;
+ case DNN_TARGET_CUDA_FP16: *os << "CUDA_FP16"; return;
} // don't use "default:" to emit compiler warnings
*os << "DNN_TARGET_UNKNOWN(" << (int)v << ")";
}
bool withInferenceEngine /*= true*/,
bool withHalide /*= false*/,
bool withCpuOCV /*= true*/,
- bool withVkCom /*= true*/
+ bool withVkCom /*= true*/,
+ bool withCUDA /*= true*/
)
{
#ifdef HAVE_INF_ENGINE
for (std::vector< Target >::const_iterator i = available.begin(); i != available.end(); ++i)
targets.push_back(make_tuple(DNN_BACKEND_VKCOM, *i));
}
+
+#ifdef HAVE_CUDA
+ if (withCUDA)
+ {
+ //for (auto target : getAvailableTargets(DNN_BACKEND_CUDA))
+ // targets.push_back(make_tuple(DNN_BACKEND_CUDA, target));
+ targets.push_back(make_tuple(DNN_BACKEND_CUDA, DNN_TARGET_CUDA));
+ }
+#endif
+
{
available = getAvailableTargets(DNN_BACKEND_OPENCV);
for (std::vector< Target >::const_iterator i = available.begin(); i != available.end(); ++i)
CV_TEST_TAG_DNN_SKIP_VULKAN
);
#endif
+
+#ifdef HAVE_CUDA
+ registerGlobalSkipTag(
+ CV_TEST_TAG_DNN_SKIP_CUDA, CV_TEST_TAG_DNN_SKIP_CUDA_FP32, CV_TEST_TAG_DNN_SKIP_CUDA_FP16
+ );
+#endif
}
} // namespace
TEST_P(Test_Caffe_layers, MVN)
{
+ if (backend == DNN_BACKEND_CUDA)
+ applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA); /* MVN is unsupported */
+
testLayerUsingCaffeModels("layer_mvn");
}
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (backend == DNN_BACKEND_INFERENCE_ENGINE)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE);
+ if (backend == DNN_BACKEND_CUDA)
+ applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA); /* Proposal layer is unsupported */
Net net = readNetFromCaffe(_tf("net_faster_rcnn_proposal.prototxt"));
const int target = get<1>(get<3>(GetParam()));
const bool kSwapRB = true;
+ if (backend == DNN_BACKEND_CUDA)
+ applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16 && dtype != CV_32F)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (backend == DNN_BACKEND_VKCOM && dtype != CV_32F)
TEST_P(Test_ONNX_layers, InstanceNorm)
{
+ if (backend == DNN_BACKEND_CUDA)
+ applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA); /* MVN is not supported */
+
if (target == DNN_TARGET_MYRIAD)
testONNXModels("instancenorm", npy, 0, 0, false, false);
else
TEST_P(Test_ONNX_layers, Convolution3D)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2019010000)
- throw SkipTestException("Test is enabled starts from 2019R1");
+ if (backend == DNN_BACKEND_INFERENCE_ENGINE)
+ throw SkipTestException("Test is enabled starting from 2019R1");
#endif
- if (target != DNN_TARGET_CPU)
- throw SkipTestException("Only CPU is supported");
+ if (target != DNN_TARGET_CPU && backend != DNN_BACKEND_CUDA)
+ throw SkipTestException("Only CPU and CUDA is supported");
testONNXModels("conv3d");
testONNXModels("conv3d_bias");
}
#if defined(INF_ENGINE_RELEASE)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_2018R5);
#endif
- if (backend != DNN_BACKEND_INFERENCE_ENGINE || target != DNN_TARGET_CPU)
- throw SkipTestException("Only DLIE backend on CPU is supported");
+ if ((backend != DNN_BACKEND_INFERENCE_ENGINE || target != DNN_TARGET_CPU) && backend != DNN_BACKEND_CUDA)
+ throw SkipTestException("Only DLIE backend on CPU, and CUDA is supported");
testONNXModels("deconv3d");
testONNXModels("deconv3d_bias");
testONNXModels("deconv3d_pad");
TEST_P(Test_ONNX_layers, ReduceMean3D)
{
- if (target != DNN_TARGET_CPU)
- throw SkipTestException("Only CPU is supported");
+ if (target != DNN_TARGET_CPU && backend != DNN_BACKEND_CUDA)
+ throw SkipTestException("Only CPU and CUDA is supported");
testONNXModels("reduce_mean3d");
}
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2019010000)
throw SkipTestException("Test is enabled starts from 2019R1");
#endif
- if (target != DNN_TARGET_CPU)
- throw SkipTestException("Only CPU is supported");
+ if (target != DNN_TARGET_CPU && backend != DNN_BACKEND_CUDA)
+ throw SkipTestException("Only CPU and CUDA is supported");
testONNXModels("max_pool3d", npy, 0, 0, false, false);
}
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2019010000)
throw SkipTestException("Test is enabled starts from 2019R1");
#endif
- if (target != DNN_TARGET_CPU)
- throw SkipTestException("Only CPU is supported");
+ if (target != DNN_TARGET_CPU && backend != DNN_BACKEND_CUDA)
+ throw SkipTestException("Only CPU and CUDA is supported");
testONNXModels("ave_pool3d");
}
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2019010000)
throw SkipTestException("Test is enabled starts from 2019R1");
#endif
- if (target != DNN_TARGET_CPU)
- throw SkipTestException("Only CPU is supported");
+ if (target != DNN_TARGET_CPU && backend != DNN_BACKEND_CUDA)
+ throw SkipTestException("Only CPU and CUDA is supported");
testONNXModels("pool_conv_3d");
}
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2019010000)
throw SkipTestException("Test is enabled starts from 2019R1");
#endif
- if (target != DNN_TARGET_CPU)
- throw SkipTestException("Only CPU is supported");
+ if (target != DNN_TARGET_CPU && backend != DNN_BACKEND_CUDA)
+ throw SkipTestException("Only CPU and CUDA is supported");
String onnxmodel = findDataFile("dnn/resnet-34_kinetics.onnx", false);
Mat image0 = imread(findDataFile("dnn/dog416.png"));
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2019010000)
throw SkipTestException("Test is enabled starts from 2019R1");
#endif
- if (target != DNN_TARGET_CPU)
- throw SkipTestException("Only CPU is supported");
+ if (target != DNN_TARGET_CPU && backend != DNN_BACKEND_CUDA)
+ throw SkipTestException("Only CPU and CUDA is supported");
runTensorFlowNet("conv3d");
}
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2019010000)
throw SkipTestException("Test is enabled starts from 2019R1");
#endif
- if (target != DNN_TARGET_CPU)
- throw SkipTestException("Only CPU is supported");
+ if (target != DNN_TARGET_CPU && backend != DNN_BACKEND_CUDA)
+ throw SkipTestException("Only CPU and CUDA is supported");
runTensorFlowNet("max_pool3d");
}
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2019010000)
throw SkipTestException("Test is enabled starts from 2019R1");
#endif
- if (target != DNN_TARGET_CPU)
- throw SkipTestException("Only CPU is supported");
+ if (target != DNN_TARGET_CPU && backend != DNN_BACKEND_CUDA)
+ throw SkipTestException("Only CPU and CUDA is supported");
runTensorFlowNet("ave_pool3d");
}
TEST_P(Test_TensorFlow_layers, deconvolution)
{
+ if (backend == DNN_BACKEND_CUDA)
+ applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA); /* deconvolution is currently broken in the CUDA backend */
+
runTensorFlowNet("deconvolution");
runTensorFlowNet("deconvolution_same");
runTensorFlowNet("deconvolution_stride_2_same");
TEST_P(Test_TensorFlow_layers, lstm)
{
+ if (backend == DNN_BACKEND_CUDA)
+ applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA); /* LSTM is not supported */
if (backend == DNN_BACKEND_INFERENCE_ENGINE)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE);
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
TEST_P(Test_Torch_layers, net_normalize)
{
+ if (backend == DNN_BACKEND_CUDA)
+ applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA); /* only L1 and L2 norms are supported */
runTorchNet("net_normalize", "", false, true);
}