Support Mean op for acl_neon backend (#5707)
author Jiseob Jang/On-Device Lab(SR)/Engineer/Samsung Electronics <jiseob.jang@samsung.com>
Fri, 19 Jul 2019 08:20:18 +0000 (17:20 +0900)
committer Hyeongseok Oh/On-Device Lab(SR)/Staff Engineer/Samsung Electronics <hseok82.oh@samsung.com>
Fri, 19 Jul 2019 08:20:18 +0000 (17:20 +0900)
This commit supports the Mean op for the acl_neon backend.

Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h [new file with mode: 0644]
runtimes/libs/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp [new file with mode: 0644]
runtimes/neurun/backend/acl_neon/ConstantInitializer.cc
runtimes/neurun/backend/acl_neon/ConstantInitializer.h
runtimes/neurun/backend/acl_neon/KernelGenerator.cc
runtimes/neurun/backend/acl_neon/KernelGenerator.h
runtimes/neurun/backend/acl_neon/ShapeFixer.cc
runtimes/neurun/backend/acl_neon/ShapeFixer.h
tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon

diff --git a/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h b/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h
new file mode 100644 (file)
index 0000000..7209acf
--- /dev/null
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__
+#define __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform a reduce-mean operation */
+class NEReduceMeanEx : public IFunction
+{
+public:
+  /** Constructor */
+  NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+  /** Configure kernel
+   *
+   * @note Supported tensor rank: up to 4
+   *
+   * @param[in]  input          Source tensor. Data type supported: QASYMM8/F16/F32
+   * @param[in]  reduction_axis Reduction axis vector.
+   * @param[in]  keep_dims      If true, retains reduced dimensions with length 1.
+   * @param[out] output         Destination tensor. Data type supported: Same as @p input
+   */
+  void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+                 ITensor *output);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEReduceMeanEx
+   *
+   * @param[in] input          Source tensor. Data type supported: QASYMM8/F16/F32
+   * @param[in] reduction_axis Reduction axis vector.
+   * @param[in] keep_dims      If true, retains reduced dimensions with length 1.
+   * @param[in] output         Destination tensor. Data type supported: Same as @p input
+   *
+   * @return A status
+   */
+  static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+                         bool keep_dims, const ITensorInfo *output);
+
+  // Inherited methods overridden:
+  void run() override;
+
+private:
+  MemoryGroup _memory_group;
+  std::unique_ptr<NEReductionOperation[]> _reduction_kernels{nullptr};
+  std::unique_ptr<Tensor[]> _reduced_outs{nullptr};
+  NEReshapeLayer _reshape;
+  unsigned int _reduction_ops;
+  bool _keep_dims;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ */
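
For reference, a minimal usage sketch of this function (the tensor shape, axis values, and main() scaffolding are illustrative assumptions, not part of this patch; it assumes the 2019-era ACL runtime API used throughout this commit):

#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
  Tensor input, output;
  // Rank-4 F32 input; ACL shapes are listed innermost-first: (W, H, C, N).
  input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U, 1U), 1, DataType::F32));

  // Reduce over W and H (ACL dimensions 0 and 1) and keep them as length-1 dims.
  NEReduceMeanEx reduce_mean;
  reduce_mean.configure(&input, Coordinates(0, 1), /*keep_dims=*/true, &output);

  input.allocator()->allocate();
  output.allocator()->allocate(); // output info is expected to be initialized by configure()
  // (fill the input buffer here before running)
  reduce_mean.run();
  return 0;
}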
diff --git a/runtimes/libs/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp b/runtimes/libs/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
new file mode 100644 (file)
index 0000000..c65e935
--- /dev/null
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+      _reduction_ops(), _keep_dims()
+{
+}
+
+Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+                                bool keep_dims, const ITensorInfo *output)
+{
+  ARM_COMPUTE_UNUSED(keep_dims);
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+  ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+  TensorShape out_shape = input->tensor_shape();
+  const unsigned int reduction_ops = reduction_axis.num_dimensions();
+  const int input_dims = input->num_dimensions();
+  Coordinates axis_local = reduction_axis;
+
+  // Convert negative axis
+  for (unsigned int i = 0; i < reduction_ops; ++i)
+  {
+    axis_local[i] = wrap_around(axis_local[i], input_dims);
+  }
+
+  std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+  for (unsigned int i = 0; i < reduction_ops; ++i)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+    ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
+                                input->num_dimensions() - 1);
+    if (output->total_size() > 0 && keep_dims)
+    {
+      ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+    }
+    if (keep_dims)
+    {
+      out_shape.set(axis_local[i], 1);
+    }
+    else
+    {
+      out_shape.remove_dimension(axis_local[i] - i);
+    }
+  }
+  const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+  return Status{};
+}
+
+void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+                               ITensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+  _reduction_ops = reduction_axis.num_dimensions();
+  _reduction_kernels =
+      arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops);
+  _reduced_outs =
+      arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
+  _keep_dims = keep_dims;
+
+  Coordinates axis_local = reduction_axis;
+  const int input_dims = input->info()->num_dimensions();
+  const unsigned int reduction_ops = reduction_axis.num_dimensions();
+
+  // Convert negative axis
+  for (unsigned int i = 0; i < reduction_ops; ++i)
+  {
+    axis_local[i] = wrap_around(axis_local[i], input_dims);
+  }
+
+  // Perform reduction for every axis
+  for (unsigned int i = 0; i < _reduction_ops; ++i)
+  {
+    TensorShape out_shape = i == 0 ? input->info()->tensor_shape()
+                                   : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+    out_shape.set(axis_local[i], 1);
+    auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+
+    if (i == _reduction_ops - 1 && keep_dims)
+    {
+      _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
+    }
+    else
+    {
+      _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
+                                                    input->info()->data_type(),
+                                                    input->info()->quantization_info())
+                                             .set_data_layout(output->info()->data_layout()));
+      _memory_group.manage(_reduced_outs.get() + i);
+      _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i],
+                                      ReductionOperation::MEAN_SUM);
+    }
+  }
+
+  // Allocate intermediate tensors
+  for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+  {
+    _reduced_outs[i].allocator()->allocate();
+  }
+
+  // Configure reshape layer if we want to drop the dimensions
+  if (!keep_dims)
+  {
+    TensorShape out_shape = input->info()->tensor_shape();
+
+    // We have to sort the reduction axis vectors in order for remove_dimension
+    // to work properly
+    std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
+    for (unsigned int i = 0; i < _reduction_ops; ++i)
+    {
+      out_shape.remove_dimension(axis_local[i] - i);
+    }
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+    _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+  }
+}
+
+void NEReduceMeanEx::run()
+{
+  _memory_group.acquire();
+
+  for (unsigned int i = 0; i < _reduction_ops; ++i)
+  {
+    _reduction_kernels[i].run();
+  }
+
+  if (!_keep_dims)
+  {
+    _reshape.run();
+  }
+  _memory_group.release();
+}
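
The shape arithmetic in validate() and configure() has one subtle step: after the axes are sorted, remove_dimension(axis_local[i] - i) compensates for the fact that each earlier removal shifts the remaining axis indices down by one. Below is a standalone sketch of the same computation using plain C++ containers instead of the ACL types (function and variable names are illustrative):

#include <algorithm>
#include <cstdio>
#include <vector>

// Wrap negative axes, sort them, then either set reduced dims to 1
// (keep_dims) or erase them, adjusting each index by the number of
// dimensions already removed.
std::vector<int> reduced_shape(std::vector<int> shape, std::vector<int> axes, bool keep_dims)
{
  const int rank = static_cast<int>(shape.size());
  for (auto &a : axes)
    a = (a % rank + rank) % rank; // wrap_around for negative axes
  std::sort(axes.begin(), axes.end());
  for (std::size_t i = 0; i < axes.size(); ++i)
  {
    if (keep_dims)
      shape[axes[i]] = 1;
    else
      shape.erase(shape.begin() + axes[i] - static_cast<int>(i));
  }
  return shape;
}

int main()
{
  // Frontend shape (N, H, W, C) = (1, 4, 4, 3), reducing H and W (axes 1 and 2):
  for (int d : reduced_shape({1, 4, 4, 3}, {1, 2}, true))
    std::printf("%d ", d); // prints: 1 1 1 3
  std::printf("\n");
  for (int d : reduced_shape({1, 4, 4, 3}, {1, 2}, false))
    std::printf("%d ", d); // prints: 1 3
  std::printf("\n");
  return 0;
}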
diff --git a/runtimes/neurun/backend/acl_neon/ConstantInitializer.cc b/runtimes/neurun/backend/acl_neon/ConstantInitializer.cc
index 7366582..4ed578f 100644 (file)
@@ -124,6 +124,13 @@ void ConstantInitializer::visit(const model::operation::MaxPool2DNode &node)
   registerPermuteInitializer(input_index, input_obj);
 }
 
+void ConstantInitializer::visit(const model::operation::MeanNode &node)
+{
+  const auto &input_index = node.getInputs().at(model::operation::MeanNode::INPUT);
+  const auto &input_obj = _operands.at(input_index);
+  registerPermuteInitializer(input_index, input_obj);
+}
+
 void ConstantInitializer::visit(const model::operation::MulNode &node)
 {
   const auto &lhs_index = node.getInputs().at(model::operation::MulNode::LHS);
diff --git a/runtimes/neurun/backend/acl_neon/ConstantInitializer.h b/runtimes/neurun/backend/acl_neon/ConstantInitializer.h
index 62547d9..91c818f 100644 (file)
@@ -45,6 +45,7 @@ public:
   void visit(const model::operation::DepthwiseConv2DNode &) override;
   void visit(const model::operation::FullyConnectedNode &) override;
   void visit(const model::operation::MaxPool2DNode &) override;
+  void visit(const model::operation::MeanNode &) override;
   void visit(const model::operation::MulNode &) override;
   void visit(const model::operation::ReshapeNode &) override;
   void visit(const model::operation::RSQRTNode &) override;
diff --git a/runtimes/neurun/backend/acl_neon/KernelGenerator.cc b/runtimes/neurun/backend/acl_neon/KernelGenerator.cc
index 511a9d1..1f57e75 100644 (file)
@@ -26,6 +26,7 @@
 #include <arm_compute/runtime/NEON/functions/NEActivationLayer.h>
 #include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>
 #include <arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEReduceMeanEx.h>
 #include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h>
 #include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
 #include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h>
@@ -280,6 +281,95 @@ void KernelGenerator::visit(const model::operation::MaxPool2DNode &node)
   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
 }
 
+void KernelGenerator::visit(const model::operation::MeanNode &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  const auto ifm_index{node.getInputs().at(model::operation::MeanNode::Input::INPUT)};
+
+  const auto axis_index{node.param().axis_index};
+  const auto keep_dims_index{node.param().keep_dims_index};
+
+  const auto ifm_shape = _ctx.at(ifm_index).shape();
+
+  std::set<uint32_t> axis;
+  {
+    const auto ifm_rank = ifm_shape.rank();
+    const auto axis_shape = _ctx.at(axis_index).shape();
+    switch (axis_shape.rank())
+    {
+      case 0: // scalar
+      {
+        auto axis_value = _ctx.at(axis_index).asScalar<int32_t>();
+        if (axis_value < 0)
+        {
+          axis_value += ifm_rank;
+        }
+        axis.insert(::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value());
+        break;
+      }
+      case 1: // vector
+      {
+        const auto axis_base = _ctx.at(axis_index).data().base();
+        const int axis_size = axis_shape.num_elements();
+
+        // If the axis data is not constant but only arrives as runtime input data, we
+        // have to find a way to infer the output shape when sinking the output.
+        assert(axis_base != nullptr);
+        for (int32_t n = 0; n < axis_size; ++n)
+        {
+          int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n);
+          if (axis_value < 0)
+          {
+            axis_value += ifm_rank;
+          }
+          axis.insert(
+              ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value());
+        }
+        break;
+      }
+      default:
+        throw std::runtime_error("Mean: unsupported axis rank");
+    }
+  }
+
+  const auto ifm_rank = ifm_shape.rank();
+
+  bool keep_dims = _ctx.at(keep_dims_index).asScalar<int32_t>() != 0;
+  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+  arm_compute::Coordinates fixed_axis;
+  // TODO Support NCHW frontend
+  // TODO Change the layout of frontend and backend to be the same
+  auto acl_layout = ifm_alloc->handle()->info()->data_layout();
+  // CWHN -> WHCN
+  uint32_t permutation[4] = {2, 0, 1, 3};
+  for (auto a : axis)
+  {
+    if (acl_layout == ::arm_compute::DataLayout::NCHW && ifm_rank == 4)
+    {
+      fixed_axis.set(fixed_axis.num_dimensions(), permutation[a]);
+    }
+    else
+    {
+      fixed_axis.set(fixed_axis.num_dimensions(), a);
+    }
+  }
+
+  std::unique_ptr<::arm_compute::IFunction> fn;
+
+  // NOTE NEReduceMean has a bug: it does not support the NHWC layout.
+  //      Its intermediate tensors always use the NCHW layout.
+  auto l = nnfw::cpp14::make_unique<::arm_compute::NEReduceMeanEx>();
+
+  l->configure(ifm_alloc->handle(), fixed_axis, keep_dims, ofm_alloc->handle());
+
+  fn = std::move(l);
+
+  auto acl_fn = asAclFunction(std::move(fn));
+
+  _execution_builder->append(std::move(acl_fn));
+}
+
 void KernelGenerator::visit(const model::operation::AvgPool2DNode &node)
 {
   const auto ofm_index{node.getOutputs().at(0)};
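
The axis remapping in the MeanNode visitor above is dense: a frontend (NHWC) axis is first flipped into ACL's innermost-first numbering and then, when the backend tensor layout is NCHW, remapped again via the {2, 0, 1, 3} permutation (CWHN -> WHCN). Below is a small sketch of that mapping; the rank - axis - 1 flip is an assumption about ToARMComputeAxis made for illustration, not a quote of the acl_common implementation:

#include <cstdint>
#include <cstdio>

// Assumed behavior of ToARMComputeAxis for a rank-r tensor: the frontend
// counts axes outermost-first, ACL innermost-first, so frontend axis a
// becomes ACL axis (r - a - 1).
uint32_t to_acl_axis(uint32_t rank, uint32_t axis) { return rank - axis - 1; }

int main()
{
  const uint32_t rank = 4;
  // Extra remap applied when the backend tensor layout is NCHW: CWHN -> WHCN.
  const uint32_t permutation[4] = {2, 0, 1, 3};
  const char *frontend_names[4] = {"N", "H", "W", "C"};

  for (uint32_t a = 0; a < rank; ++a)
  {
    const uint32_t acl_axis = to_acl_axis(rank, a);
    std::printf("frontend axis %u (%s) -> ACL axis %u -> NCHW-fixed axis %u\n", a,
                frontend_names[a], acl_axis, permutation[acl_axis]);
  }
  return 0;
}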
diff --git a/runtimes/neurun/backend/acl_neon/KernelGenerator.h b/runtimes/neurun/backend/acl_neon/KernelGenerator.h
index d2361a9..ecb05c3 100644 (file)
@@ -40,6 +40,7 @@ public:
   void visit(const model::operation::Conv2DNode &) override;
   void visit(const model::operation::DepthwiseConv2DNode &) override;
   void visit(const model::operation::MaxPool2DNode &) override;
+  void visit(const model::operation::MeanNode &) override;
   void visit(const model::operation::AvgPool2DNode &) override;
   void visit(const model::operation::ConcatNode &) override;
   void visit(const model::operation::FullyConnectedNode &) override;
diff --git a/runtimes/neurun/backend/acl_neon/ShapeFixer.cc b/runtimes/neurun/backend/acl_neon/ShapeFixer.cc
index ee2e650..b268527 100644 (file)
@@ -64,6 +64,8 @@ void ShapeFixer::visit(const model::operation::DepthwiseConv2DNode &) { /* DO NO
 
 void ShapeFixer::visit(const model::operation::MaxPool2DNode &) { /* DO NOTHING */}
 
+void ShapeFixer::visit(const model::operation::MeanNode &) { /* DO NOTHING */}
+
 void ShapeFixer::visit(const model::operation::AvgPool2DNode &) { /* DO NOTHING */}
 
 void ShapeFixer::visit(const model::operation::ConcatNode &node)
diff --git a/runtimes/neurun/backend/acl_neon/ShapeFixer.h b/runtimes/neurun/backend/acl_neon/ShapeFixer.h
index 897e504..36eb47a 100644 (file)
@@ -40,6 +40,7 @@ public:
   void visit(const model::operation::Conv2DNode &) override;
   void visit(const model::operation::DepthwiseConv2DNode &) override;
   void visit(const model::operation::MaxPool2DNode &) override;
+  void visit(const model::operation::MeanNode &) override;
   void visit(const model::operation::AvgPool2DNode &) override;
   void visit(const model::operation::ConcatNode &) override;
   void visit(const model::operation::FullyConnectedNode &) override;
diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
index b1b8784..5d4c9bf 100644 (file)
@@ -49,7 +49,6 @@ GeneratedTests.relu6*
 GeneratedTests.relu*
 GeneratedTests.resize_bilinear*
 GeneratedTests.rnn*
-GeneratedTests.mean*
 GeneratedTests.pad*
 GeneratedTests.space_to_depth*
 GeneratedTests.sqrt_ex*