From 505dde09de0b245bb9958d42673671761a690be1 Mon Sep 17 00:00:00 2001
From: YashasSamaga
Date: Mon, 4 Oct 2021 12:38:45 +0530
Subject: [PATCH] support broadcasting in eltwise ops

---
 modules/dnn/src/cuda/eltwise_ops.cu              | 271 ++++++++++++++++++++---
 modules/dnn/src/cuda/kernel_dispatcher.hpp       |  18 ++
 modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp |  14 +-
 modules/dnn/src/dnn.cpp                          |  14 +-
 4 files changed, 282 insertions(+), 35 deletions(-)

diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu
index 109c3fb..db4daa4 100644
--- a/modules/dnn/src/cuda/eltwise_ops.cu
+++ b/modules/dnn/src/cuda/eltwise_ops.cu
@@ -5,13 +5,16 @@
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
 
+#include "array.hpp"
 #include "functors.hpp"
 #include "grid_stride_range.hpp"
 #include "execution.hpp"
 #include "vector_traits.hpp"
+#include "kernel_dispatcher.hpp"
 
 #include "../cuda4dnn/csl/stream.hpp"
 #include "../cuda4dnn/csl/span.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
 
 #include <opencv2/core.hpp>
 
@@ -40,6 +43,32 @@ namespace raw {
             v_store(output_vPtr[i], vec_x);
         }
     }
+
+    template <class T, class EltwiseOp, std::size_t Rank>
+    __global__ void eltwise_op_bcast(
+        Span<T> output, array<size_type, Rank> out_strides,
+        View<T> x, array<size_type, Rank> x_strides, array<bool, Rank> x_bcast,
+        View<T> y, array<size_type, Rank> y_strides, array<bool, Rank> y_bcast,
+        const typename EltwiseOp::Params params) {
+        EltwiseOp eltwise_op(params);
+
+        for (auto i : grid_stride_range(output.size())) {
+            index_type out_index = i / out_strides[0];
+            index_type x_index = x_bcast[0] ? 0 : out_index * x_strides[0];
+            index_type y_index = y_bcast[0] ? 0 : out_index * y_strides[0];
+
+            for (int j = 1; j < Rank; j++)
+            {
+                out_index = (i % out_strides[j - 1]) / out_strides[j];
+                if (!x_bcast[j])
+                    x_index += out_index * x_strides[j];
+                if (!y_bcast[j])
+                    y_index += out_index * y_strides[j];
+            }
+
+            output[i] = eltwise_op(x[x_index], y[y_index]);
+        }
+    }
 }
 
 template <class T, class EltwiseOp, std::size_t N> static
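For reference, the index arithmetic in raw::eltwise_op_bcast can be read on the CPU as follows. This is a minimal standalone sketch, not part of the patch; the helper name broadcast_offset and the std::array-based signature are illustrative only.

    #include <array>
    #include <cstddef>

    // Mirrors the kernel's addressing: strides[r] is the number of elements skipped
    // per step along axis r (row-major, so strides[Rank - 1] == 1). Axes flagged as
    // broadcast contribute nothing to the input offset, so the same input element is
    // reused along them.
    template <std::size_t Rank>
    std::size_t broadcast_offset(std::size_t flat_output_index,
                                 const std::array<std::size_t, Rank>& out_strides,
                                 const std::array<std::size_t, Rank>& in_strides,
                                 const std::array<bool, Rank>& in_bcast)
    {
        std::size_t coord = flat_output_index / out_strides[0];
        std::size_t offset = in_bcast[0] ? 0 : coord * in_strides[0];
        for (std::size_t j = 1; j < Rank; j++) {
            coord = (flat_output_index % out_strides[j - 1]) / out_strides[j];
            if (!in_bcast[j])
                offset += coord * in_strides[j];
        }
        return offset;
    }

    // Example: output shape [2, 3] against an input of shape [1, 3] broadcast along
    // axis 0: broadcast_offset<2>(4, {3, 1}, {3, 1}, {true, false}) maps output
    // element (1, 1) to input element 1.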
@@ -55,63 +84,251 @@ void launch_vectorized_eltwise_op(const Stream& stream, Span<T> output, View<T>
     launch_kernel(kernel, policy, output, x, y, params);
 }
 
+template <class T, class EltwiseOp, std::size_t Rank> static
+void launch_eltwise_op_bcast(
+    const Stream& stream,
+    Span<T> output, const std::vector<std::size_t>& outStride,
+    View<T> x, const std::vector<std::size_t>& inStride1, const std::vector<bool>& inBcast1,
+    View<T> y, const std::vector<std::size_t>& inStride2, const std::vector<bool>& inBcast2,
+    const typename EltwiseOp::Params& params)
+{
+    CV_Assert(outStride.size() == Rank);
+    CV_Assert(inStride1.size() == Rank);
+    CV_Assert(inStride2.size() == Rank);
+    CV_Assert(inBcast1.size() == Rank);
+    CV_Assert(inBcast2.size() == Rank);
+
+    array<size_type, Rank> outStride_k, inStride1_k, inStride2_k;
+    outStride_k.assign(std::begin(outStride), std::end(outStride));
+    inStride1_k.assign(std::begin(inStride1), std::end(inStride1));
+    inStride2_k.assign(std::begin(inStride2), std::end(inStride2));
+
+    array<bool, Rank> inBcast1_k, inBcast2_k;
+    inBcast1_k.assign(std::begin(inBcast1), std::end(inBcast1));
+    inBcast2_k.assign(std::begin(inBcast2), std::end(inBcast2));
+
+    auto kernel = raw::eltwise_op_bcast<T, EltwiseOp, Rank>;
+    auto policy = make_policy(kernel, output.size(), 0, stream);
+    launch_kernel(kernel, policy, output, outStride_k, x, inStride1_k, inBcast1_k, y, inStride2_k, inBcast2_k, params);
+}
+
+GENERATE_KERNEL_DISPATCHER_2TP(eltwise_op_bcast_dispatcher, launch_eltwise_op_bcast);
+
 template <class T, class EltwiseOp> static
-void eltwise_op(const Stream& stream, Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params& params = {}) {
-    CV_Assert(x.size() == y.size());
-    CV_Assert(x.size() == output.size());
+void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y, const typename EltwiseOp::Params& params = {}) {
+    if (is_shape_same(output, x) && is_shape_same(output, y))
+    {
+        /* no broadcasting; use fast path */
+        CV_Assert(x.size() == y.size());
+        CV_Assert(x.size() == output.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
+            launch_vectorized_eltwise_op<T, EltwiseOp, 4>(stream, output, x, y, params);
+        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
+            launch_vectorized_eltwise_op<T, EltwiseOp, 2>(stream, output, x, y, params);
+        } else {
+            launch_vectorized_eltwise_op<T, EltwiseOp, 1>(stream, output, x, y, params);
+        }
+    }
+    else
+    {
+        CV_Assert(is_shape_compatible(output, x));
+        CV_Assert(is_shape_compatible(output, y));
+
+        /* matching singleton axes in both input tensors can be eliminated
+         *
+         * Reasoning:
+         * ----------
+         * Singleton axes do not contribute towards address calculation. They are redundant
+         * unless there is broadcasting. If both input tensors have a singleton axis at a
+         * given position, there is no broadcasting on that axis.
+         *
+         * Example:
+         * ---------
+         * x: [1, 256, 32, 32] -> [256, 32, 32]
+         * y: [1, 256, 1, 1] -> [256, 1, 1]
+         */
+        for (int r = 0; r < output.rank(); r++)
+        {
+            while (x.get_axis_size(r) == 1 && y.get_axis_size(r) == 1) {
+                CV_Assert(output.get_axis_size(r) == 1);
+
+                x.squeeze(r);
+                y.squeeze(r);
+                output.squeeze(r);
+            }
+        }
+
+        auto inShape1 = x.shape_as_vector();
+        auto inShape2 = y.shape_as_vector();
+        auto outShape = output.shape_as_vector();
+
+        /* contiguous axes that do not broadcast can be merged into one axis
+         *
+         * Example:
+         * ---------
+         * x: [32, 8, 8] -> [32, 64]
+         * y: [1, 8, 8] -> [1, 64]
+         */
+        for (int i = 0; i < inShape1.size(); i++) {
+            /* check if axis `i` requires any broadcasting */
+            if (inShape1[i] == inShape2[i]) {
+                /* loop invariant: `i` is the first axis in the contiguous axis sequence */
+
+                int j = i + 1; /* `j` is the axis which we will attempt to merge */
+                while (j < inShape1.size() && inShape1[j] == inShape2[j]) {
+                    CV_Assert(outShape[j] == inShape1[j]);
+
+                    /* `j` does not broadcast either; merge `i` and `j` */
+                    auto new_size = inShape1[i] * inShape1[j];
+                    inShape1[i] = new_size;
+                    inShape2[i] = new_size;
+                    outShape[i] = new_size;
+
+                    /* delete axis `j` */
+                    inShape1.erase(std::begin(inShape1) + j);
+                    inShape2.erase(std::begin(inShape2) + j);
+                    outShape.erase(std::begin(outShape) + j);
+
+                    /* optimizations should not break the invariants */
+                    CV_Assert(inShape1.size() == outShape.size());
+                    CV_Assert(inShape2.size() == outShape.size());
+                    CV_Assert(inShape1[i] == outShape[i]);
+                    CV_Assert(inShape2[i] == outShape[i]);
+                }
+            }
+        }
+
+        /* contiguous broadcasting axes on the same tensor can be merged into one axis
+         *
+         * Example:
+         * ---------
+         * x: [256, 8, 8] -> [256, 64]
+         * y: [256, 1, 1] -> [256, 1]
+         */
+        for (int i = 0; i < inShape1.size(); i++) {
+            /* check if axis `i` requires any broadcasting in tensor 1 */
+            if (inShape1[i] == 1 && inShape2[i] != 1) {
+                /* loop invariant: `i` is the first axis in the contiguous axis sequence */
+
+                int j = i + 1; /* `j` is the axis which we will attempt to merge */
+                while (j < inShape1.size() && inShape1[j] == 1 && inShape2[j] != 1) {
+                    CV_Assert(outShape[j] == inShape2[j]);
+
+                    /* `j` also broadcasts in tensor 1; merge `i` and `j` */
+                    inShape1[i] = 1;
+                    inShape2[i] = inShape2[i] * inShape2[j];
+                    outShape[i] = inShape2[i];
+
+                    /* delete axis `j` */
+                    inShape1.erase(std::begin(inShape1) + j);
+                    inShape2.erase(std::begin(inShape2) + j);
+                    outShape.erase(std::begin(outShape) + j);
+
+                    /* optimizations should not break the invariants */
+                    CV_Assert(inShape1.size() == outShape.size());
+                    CV_Assert(inShape2.size() == outShape.size());
+                    CV_Assert(inShape1[i] == 1);
+                    CV_Assert(inShape2[i] == outShape[i]);
+                }
+            }
+
+            /* check if axis `i` requires any broadcasting in tensor 2 */
+            if (inShape1[i] != 1 && inShape2[i] == 1) {
+                /* loop invariant: `i` is the first axis in the contiguous axis sequence */
+
+                int j = i + 1; /* `j` is the axis which we will attempt to merge */
+                while (j < inShape1.size() && inShape1[j] != 1 && inShape2[j] == 1) {
+                    CV_Assert(outShape[j] == inShape1[j]);
+
+                    /* `j` also broadcasts in tensor 2; merge `i` and `j` */
+                    inShape1[i] = inShape1[i] * inShape1[j];
+                    inShape2[i] = 1;
+                    outShape[i] = inShape1[i];
+
+                    /* delete axis `j` */
+                    inShape1.erase(std::begin(inShape1) + j);
+                    inShape2.erase(std::begin(inShape2) + j);
+                    outShape.erase(std::begin(outShape) + j);
+
+                    /* optimizations should not break the invariants */
+                    CV_Assert(inShape1.size() == outShape.size());
+                    CV_Assert(inShape2.size() == outShape.size());
+                    CV_Assert(inShape1[i] == outShape[i]);
+                    CV_Assert(inShape2[i] == 1);
+                }
+            }
+        }
+
+        auto rank = outShape.size();
+
+        std::vector<std::size_t> inStride1(rank), inStride2(rank), outStride(rank);
+        inStride1.back() = 1;
+        inStride2.back() = 1;
+        outStride.back() = 1;
+        /* garbage, ..., garbage, 1 */
+
+        std::copy(std::begin(inShape1) + 1, std::end(inShape1), std::begin(inStride1));
+        std::copy(std::begin(inShape2) + 1, std::end(inShape2), std::begin(inStride2));
+        std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
+        /* dim[1], dim[2], ..., dim[-1], 1 */
+
+        std::partial_sum(inStride1.rbegin(), inStride1.rend(), inStride1.rbegin(), std::multiplies<std::size_t>());
+        std::partial_sum(inStride2.rbegin(), inStride2.rend(), inStride2.rbegin(), std::multiplies<std::size_t>());
+        std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
+        /* stride[0], stride[1], ..., stride[-2], 1 */
+
+        std::vector<bool> inBcast1(rank), inBcast2(rank);
+        std::transform(std::begin(inShape1), std::end(inShape1), std::begin(inBcast1), [](std::size_t sz) { return sz == 1; });
+        std::transform(std::begin(inShape2), std::end(inShape2), std::begin(inBcast2), [](std::size_t sz) { return sz == 1; });
 
-    if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
-        launch_vectorized_eltwise_op<T, EltwiseOp, 4>(stream, output, x, y, params);
-    } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
-        launch_vectorized_eltwise_op<T, EltwiseOp, 2>(stream, output, x, y, params);
-    } else {
-        launch_vectorized_eltwise_op<T, EltwiseOp, 1>(stream, output, x, y, params);
+        CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
+        eltwise_op_bcast_dispatcher<T, EltwiseOp, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, x, inStride1, inBcast1, y, inStride2, inBcast2, params);
     }
 }
 
 template <class T>
-void eltwise_max_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+void eltwise_max_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
     eltwise_op<T, MaxFunctor<T>>(stream, output, x, y);
 }
 
 template <class T>
-void eltwise_min_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+void eltwise_min_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
     eltwise_op<T, MinFunctor<T>>(stream, output, x, y);
 }
 
 template <class T>
-void eltwise_sum_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+void eltwise_sum_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
     eltwise_op<T, SumFunctor<T>>(stream, output, x, y);
 }
 
 template <class T>
-void eltwise_sum_coeff_2(const Stream& stream, Span<T> output, T coeff_x, View<T> x, T coeff_y, View<T> y) {
+void eltwise_sum_coeff_2(const Stream& stream, TensorSpan<T> output, T coeff_x, TensorView<T> x, T coeff_y, TensorView<T> y) {
     eltwise_op<T, ScaledSumFunctor<T>>(stream, output, x, y, {coeff_x, coeff_y});
 }
 
 template <class T>
-void eltwise_prod_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+void eltwise_prod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
     eltwise_op<T, ProductFunctor<T>>(stream, output, x, y);
 }
 
 template <class T>
-void eltwise_div_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+void eltwise_div_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
     eltwise_op<T, DivFunctor<T>>(stream, output, x, y);
 }
 
 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
-    template void eltwise_div_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
-    template void eltwise_prod_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
-    template void eltwise_sum_coeff_2(const Stream&, Span<__half>, __half, View<__half>, __half, View<__half>);
-    template void eltwise_sum_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
-    template void eltwise_max_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
-    template void eltwise_min_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
+    template void eltwise_div_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
+    template void eltwise_prod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
+    template void eltwise_sum_coeff_2(const Stream&, TensorSpan<__half>, __half, TensorView<__half>, __half, TensorView<__half>);
+    template void eltwise_sum_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
+    template void eltwise_max_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
+    template void eltwise_min_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
 #endif
-    template void eltwise_div_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
-    template void eltwise_prod_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
-    template void eltwise_sum_coeff_2(const Stream&, Span<float>, float, View<float>, float, View<float>);
-    template void eltwise_sum_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
-    template void eltwise_max_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
-    template void eltwise_min_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
+    template void eltwise_div_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
+    template void eltwise_prod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
+    template void eltwise_sum_coeff_2(const Stream&, TensorSpan<float>, float, TensorView<float>, float, TensorView<float>);
+    template void eltwise_sum_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
+    template void eltwise_max_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
+    template void eltwise_min_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
 
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
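The three shape-rewriting passes in eltwise_op exist only to lower the rank seen by the kernel; afterwards the strides and broadcast flags are derived from the collapsed shapes. A standalone sketch of that stride recipe, plus a worked example of the collapsing (the helper name row_major_strides is ours, not part of the patch, and it assumes rank >= 1):

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <iterator>
    #include <numeric>
    #include <vector>

    // Same recipe as in the patch: seed the last stride with 1, copy dim[1..] in
    // front of it, then take a reverse running product so that
    // stride[r] = dim[r+1] * dim[r+2] * ... * dim[rank-1].
    std::vector<std::size_t> row_major_strides(const std::vector<std::size_t>& shape)
    {
        std::vector<std::size_t> strides(shape.size());
        strides.back() = 1;
        std::copy(std::begin(shape) + 1, std::end(shape), std::begin(strides));
        std::partial_sum(strides.rbegin(), strides.rend(), strides.rbegin(),
                         std::multiplies<std::size_t>());
        return strides;
    }

    // Worked example of the axis optimizations:
    //   x = [1, 256, 1, 1], y = [1, 256, 32, 32], output = [1, 256, 32, 32]
    //   1. axis 0 is singleton in both inputs, so it is squeezed:
    //        x = [256, 1, 1], y = [256, 32, 32]
    //   2. no contiguous non-broadcast axes can be merged here
    //   3. axes 1 and 2 broadcast only in x, so they merge:
    //        x = [256, 1], y = [256, 1024]
    // row_major_strides({256, 1024}) == {1024, 1}; the broadcast flags for x are
    // {false, true}, so the kernel only has to handle a rank-2 problem.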
diff --git a/modules/dnn/src/cuda/kernel_dispatcher.hpp b/modules/dnn/src/cuda/kernel_dispatcher.hpp
index 6eff834..b0fc658 100644
--- a/modules/dnn/src/cuda/kernel_dispatcher.hpp
+++ b/modules/dnn/src/cuda/kernel_dispatcher.hpp
@@ -73,4 +73,22 @@
             name<T, start + 1, end, Args...>(selector, std::forward<Args>(args)...); \
     }
 
+// Same as GENERATE_KERNEL_DISPATCHER but takes two class template parameters T and TP1 instead of just T
+#define GENERATE_KERNEL_DISPATCHER_2TP(name,func); \
+    template <class T, class TP1, std::size_t start, std::size_t end, class... Args> static \
+    typename std::enable_if<start == end, void> \
+    ::type name(int selector, Args&& ...args) { \
+        if(selector == start) \
+            func<T, TP1, start>(std::forward<Args>(args)...); \
+    } \
+    \
+    template <class T, class TP1, std::size_t start, std::size_t end, class... Args> static \
+    typename std::enable_if<start != end, void> \
+    ::type name(int selector, Args&& ...args) { \
+        if(selector == start) \
+            func<T, TP1, start>(std::forward<Args>(args)...); \
+        else \
+            name<T, TP1, start + 1, end, Args...>(selector, std::forward<Args>(args)...); \
+    }
+
 #endif /* OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP */
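GENERATE_KERNEL_DISPATCHER_2TP only differs from the existing GENERATE_KERNEL_DISPATCHER in carrying a second class parameter (here the functor type) through to the launcher. Conceptually it expands to a recursive overload set like the hand-written sketch below; the names are illustrative, and the real macro uses std::enable_if rather than if constexpr:

    #include <cstddef>

    // Illustrative stand-in for launch_eltwise_op_bcast<T, EltwiseOp, Rank>.
    template <std::size_t Rank>
    void launch_for_rank()
    {
        // ... launch the kernel instantiated for this compile-time rank ...
    }

    // Walks the compile-time range [start, end] and forwards to the instantiation
    // whose Rank equals the runtime selector, which is what the generated
    // eltwise_op_bcast_dispatcher does with the collapsed rank.
    template <std::size_t start, std::size_t end>
    void dispatch(int selector)
    {
        if (selector == static_cast<int>(start))
            launch_for_rank<start>();
        else if constexpr (start < end)
            dispatch<start + 1, end>(selector);
    }

    // dispatch<1, CSL_MAX_TENSOR_RANK>(rank) picks launch_for_rank<rank> at run time.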
diff --git a/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp b/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
index 096ba04..0e44372 100644
--- a/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
+++ b/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
@@ -6,29 +6,29 @@
 #define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP
 
 #include "../csl/stream.hpp"
-#include "../csl/span.hpp"
+#include "../csl/tensor.hpp"
 
 #include <cstddef>
 
 namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
 
     template <class T>
-    void eltwise_max_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);
+    void eltwise_max_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
 
     template <class T>
-    void eltwise_min_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);
+    void eltwise_min_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
 
     template <class T>
-    void eltwise_sum_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);
+    void eltwise_sum_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
 
     template <class T>
-    void eltwise_sum_coeff_2(const csl::Stream& stream, csl::Span<T> output, T coeff_x, csl::View<T> x, T coeff_y, csl::View<T> y);
+    void eltwise_sum_coeff_2(const csl::Stream& stream, csl::TensorSpan<T> output, T coeff_x, csl::TensorView<T> x, T coeff_y, csl::TensorView<T> y);
 
     template <class T>
-    void eltwise_prod_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);
+    void eltwise_prod_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
 
     template <class T>
-    void eltwise_div_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);
+    void eltwise_div_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
 
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
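With the TensorSpan/TensorView signatures above, the shape information travels with the operands, so callers may pass inputs whose shapes differ from the output as long as they broadcast to it. A rough restatement of what "broadcast-compatible" means here; this is a sketch of the semantics, not the csl is_shape_compatible implementation:

    #include <cstddef>
    #include <vector>

    // An input broadcasts to the output if, axis by axis (once both are viewed at
    // the same rank), the input extent either matches the output extent or is 1.
    bool broadcasts_to(const std::vector<std::size_t>& input,
                       const std::vector<std::size_t>& output)
    {
        if (input.size() != output.size())
            return false;
        for (std::size_t i = 0; i < input.size(); i++)
            if (input[i] != output[i] && input[i] != 1)
                return false;
        return true;
    }

    // broadcasts_to({1, 256, 1, 1}, {8, 256, 32, 32}) == true
    // broadcasts_to({3, 4}, {2, 4})                   == false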
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 4e38b03..c8fc772 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -2710,7 +2710,19 @@
                         // we create a temporary backend node for eltwise layer to obtain the eltwise configuration
                         cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp do not use the context during init
                         const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
-                        const auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
+                        auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
+
+                        // broadcasting not supported in fused ops
+                        auto required_shape = shape(nextData->outputBlobs[0]);
+                        for (int i = 0; i < nextData->inputBlobs.size(); i++)
+                        {
+                            if (shape(*nextData->inputBlobs[i]) != required_shape)
+                            {
+                                eltwiseNode.reset();
+                                break;
+                            }
+                        }
+
                         // CUDA backend uses EltwiseOp when all operands have the same number of channels; otherwise, ShortcutOp is used.
                         // Hence, a successful cast to EltwiseOp implies that the number of channels is same in all operand tensors.
                         if (eltwiseNode.empty() || eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
-- 
2.7.4
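The dnn.cpp change disables eltwise fusion as soon as any input blob's shape differs from the output shape, because the fused convolution-plus-eltwise kernels cannot broadcast; the standalone eltwise layer, which now can, handles those cases instead. The guard in isolation looks like the sketch below (MatShape is std::vector<int> in OpenCV; the surrounding Net::Impl bookkeeping is omitted and can_fuse_eltwise is a made-up name):

    #include <vector>

    using MatShape = std::vector<int>;

    // Fusion of the following eltwise layer is only attempted when every operand
    // already has exactly the output shape; otherwise the eltwise node is dropped
    // from the fusion candidate (eltwiseNode.reset() in the patch) and the layer
    // runs as a separate, broadcasting-capable eltwise op.
    bool can_fuse_eltwise(const MatShape& required_shape,
                          const std::vector<MatShape>& input_shapes)
    {
        for (const auto& s : input_shapes)
            if (s != required_shape)
                return false;
        return true;
    }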