From e54e95a314f59a06593aa551eab8eac0c6502248 Mon Sep 17 00:00:00 2001
From: Prasanna R/System SW/SRI-Bangalore/Engineer/Samsung Electronics
Date: Tue, 6 Nov 2018 06:11:53 +0530
Subject: [PATCH] Enable broadcast support for Equal_Ex op (#3431)

This patch enables broadcast support for the Equal_Ex op.

Related issue: #3295.

Signed-off-by: prasannar
---
 .../arm_compute/core/CL/kernels/CLEqualKernel.h    |   2 +
 .../src/core/CL/kernels/CLEqualKernel.cpp          | 125 ++++++++++++++++++---
 .../src/runtime/CL/functions/CLEqual.cpp           |  14 +++
 runtimes/pure_arm_compute/src/compilation.cc       |  10 ++
 4 files changed, 136 insertions(+), 15 deletions(-)

diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEqualKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEqualKernel.h
index e20fda7..847beec 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEqualKernel.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEqualKernel.h
@@ -48,6 +48,8 @@ public:
   // Inherited methods overridden:
   void run(const Window &window, cl::CommandQueue &queue) override;
 
+  BorderSize border_size() const override;
+
 private:
   const ICLTensor *_input1;
   const ICLTensor *_input2;
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEqualKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEqualKernel.cpp
index 2348052..7777c59 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLEqualKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLEqualKernel.cpp
@@ -22,23 +22,51 @@
 
 using namespace arm_compute;
 
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+}
+
 CLEqualKernel::CLEqualKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) {}
 
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+                          const ITensorInfo *output)
+{
+  const TensorShape &out_shape =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8,
+                                                       DataType::QS16, DataType::S16, DataType::F16,
+                                                       DataType::F32, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8,
+                                                       DataType::QS16, DataType::S16, DataType::F16,
+                                                       DataType::F32, DataType::QASYMM8);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+        output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16,
+        DataType::F32, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+  return Status{};
+}
+
 void CLEqualKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
 {
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input1->info()->tensor_shape(),
-                                              input2->info()->tensor_shape());
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input1->info()->tensor_shape(),
-                                              output->info()->tensor_shape());
   ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
   ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
 
   _input1 = input1;
   _input2 = input2;
   _output = output;
 
-  constexpr unsigned int num_elems_processed_per_iteration = 16;
-
   // Create kernel
   std::string kernel_name = "equal";
   std::set<std::string> build_opts;
@@ -63,17 +91,44 @@ void CLEqualKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
   _kernel =
       static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
 
-  // Configure window
-  Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  // Auto initialize output if not initialized
+  {
+    set_shape_if_empty(*output->info(), out_shape);
+
+    if (input1->info()->data_type() == DataType::S16 ||
+        input2->info()->data_type() == DataType::S16)
+    {
+      set_format_if_unknown(*output->info(), Format::S16);
+    }
+    else if (input1->info()->data_type() == DataType::F16 &&
+             input2->info()->data_type() == DataType::F16)
+    {
+      set_format_if_unknown(*output->info(), Format::F16);
+    }
+    else if (input1->info()->data_type() == DataType::F32 ||
+             input2->info()->data_type() == DataType::F32)
+    {
+      set_format_if_unknown(*output->info(), Format::F32);
+    }
+  }
+
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
 
   AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
   AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
   AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
 
-  ValidRegion valid_region =
-      intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region());
-
-  update_window_and_padding(win, input1_access, input2_access, output_access);
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
 
   output_access.set_valid_region(win, valid_region);
 
@@ -85,15 +140,55 @@ void CLEqualKernel::run(const Window &window, cl::CommandQueue &queue)
   ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
   ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+
+  bool has_collapsed = false;
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
   Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
 
   do
   {
     unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input1, slice);
-    add_3D_tensor_argument(idx, _input2, slice);
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
     add_3D_tensor_argument(idx, _output, slice);
+
     enqueue(queue, *this, slice);
+
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
   } while (collapsed.slide_window_slice_3D(slice));
 }
+
+BorderSize CLEqualKernel::border_size() const
+{
+  const unsigned int replicateSize =
+      _output->info()->dimension(0) -
+      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int border =
+      std::min(num_elems_processed_per_iteration - 1U, replicateSize);
+  return BorderSize(0, border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLEqual.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLEqual.cpp
index 7881e3d..2b6a994 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLEqual.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLEqual.cpp
@@ -18,6 +18,10 @@
 
 #include "arm_compute/core/CL/kernels/CLEqualKernel.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/ToolchainSupport.h"
+#include <utility>
+
 using namespace arm_compute;
 
 void CLEqual::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
@@ -25,4 +29,14 @@ void CLEqual::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
   auto k = arm_compute::support::cpp14::make_unique<CLEqualKernel>();
   k->configure(input1, input2, output);
   _kernel = std::move(k);
+
+  if (output->info()->dimension(0) > 1)
+  {
+    ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+    if (broadcasted_info->info()->dimension(0) == 1)
+    {
+      _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+    }
+  }
 }
diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc
index afbd8e7..eafd239 100644
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -3576,6 +3576,16 @@ void Planner::visit(const ::internal::tflite::op::Equal::Node &node)
       asTensorInfo(asTensorShape(_ctx.at(output_index).shape(), false),
                    _ctx.at(output_index).type(), _ctx.at(output_index).scale(),
                    _ctx.at(output_index).zeroPoint()));
+
+  if (!(_ctx.at(input1_index).shape() == _ctx.at(input2_index).shape()))
+  {
+    const auto broadcast_rank =
+        std::max(_ctx.at(input1_index).shape().rank(), _ctx.at(input2_index).shape().rank());
+    const_cast<::internal::tflite::operand::Shape &>(_ctx.at(input1_index).shape())
+        .extendRank(broadcast_rank);
+    const_cast<::internal::tflite::operand::Shape &>(_ctx.at(input2_index).shape())
+        .extendRank(broadcast_rank);
+  }
   _builder.addShapeConstr(input1_index,
                           asTensorInfo(asTensorShape(_ctx.at(input1_index).shape(), false),
                                        _ctx.at(input1_index).type(), _ctx.at(input1_index).scale(),
-- 
2.7.4
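
Usage note (not part of the patch): the snippet below is a minimal sketch of how the broadcast path enabled here might be exercised from client code. The header path arm_compute/runtime/CL/functions/CLEqual.h, the U8 output type, and the 4x3 / 1x3 shapes are illustrative assumptions rather than details taken from this change; only CLEqual::configure()/run() and the broadcasting plus REPLICATE border handling come from the patch itself.

  #include "arm_compute/runtime/CL/CLScheduler.h"
  #include "arm_compute/runtime/CL/CLTensor.h"
  #include "arm_compute/runtime/CL/functions/CLEqual.h" // assumed header location

  using namespace arm_compute;

  int main()
  {
    CLScheduler::get().default_init();

    // input2's X dimension is 1, so it is broadcast across input1's X dimension (4),
    // which is the case this patch enables.
    CLTensor input1, input2, output;
    input1.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));
    input2.allocator()->init(TensorInfo(TensorShape(1U, 3U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::U8)); // assumed output type

    CLEqual eq;
    // configure() also sets up the REPLICATE border handler for the broadcast input
    eq.configure(&input1, &input2, &output);

    input1.allocator()->allocate();
    input2.allocator()->allocate();
    output.allocator()->allocate();

    // ... map the tensors and fill input1/input2 here ...

    eq.run();
    CLScheduler::get().sync();
    return 0;
  }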