Support ReduceMax operation up to 4 dimensions (#2850)

author 장지섭/동작제어Lab(SR)/Engineer/삼성전자 <jiseob.jang@samsung.com>

Tue, 2 Oct 2018 01:06:10 +0000 (10:06 +0900)

committer 오형석/동작제어Lab(SR)/Staff Engineer/삼성전자 <hseok82.oh@samsung.com>

Tue, 2 Oct 2018 01:06:10 +0000 (10:06 +0900)
author 장지섭/동작제어Lab(SR)/Engineer/삼성전자 <jiseob.jang@samsung.com>
Tue, 2 Oct 2018 01:06:10 +0000 (10:06 +0900)
committer 오형석/동작제어Lab(SR)/Staff Engineer/삼성전자 <hseok82.oh@samsung.com>
Tue, 2 Oct 2018 01:06:10 +0000 (10:06 +0900)
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h

index a7d96cc..ad2b5d0 100644 (file)
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
@@ -42,23 +42,22 @@ public:
    CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default;
    /** Initialise the kernel's input, output and border mode.
     *
-   * @param[in]  input          An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
-   * @param[in] axis            Axis to reduce
-   * @param[out] output          The output tensor, Data types supported: same as @p input1. Note:
-   * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+   * @param[in]  input          An input tensor. Data types supported: QASYMM8/S32/F32.
+   * @param[in]  reduce_axis    Axes to reduce
+   * @param[out] output         The output tensor, Data types supported: same as @p input.
     */
-  void configure(const ICLTensor *input, int32_t axis, ICLTensor *output);
+  void configure(const ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output);
    /** Static function to check if given info will lead to a valid configuration of @ref
     * CLReduceMaxKernel
     *
-   * @param[in] input           An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
-   * @param[in] axis            Axis to reduce
-   * @param[in] output          The output tensor info, Data types supported: same as @p input1.
-   * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+   * @param[in] input           An input tensor info. Data types supported: QASYMM8/S32/F32.
+   * @param[in] reduce_axis     Axes to reduce
+   * @param[in] output          The output tensor info, Data types supported: same as @p input.
     *
     * @return a status
     */
-  static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
+  static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
+                         const ITensorInfo *output);
  
    // Inherited methods overridden:
    void run(const Window &window, cl::CommandQueue &queue) override;
@@ -67,7 +66,7 @@ public:
  private:
    const ICLTensor *_input;
    ICLTensor *_output;
-  int32_t _axis;
+  std::vector<uint32_t> _reduce_axis;
  };
  } // namespace arm_compute
  #endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h

index 14b473f..c88bb78 100644 (file)
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
@@ -48,21 +48,22 @@ public:
     * @note When locations of min and max occurrences are requested, the reported number of locations
     * is limited to the given array size.
     *
-   * @param[in]  input     Input image. Data types supported: F32
-   * @param[in]  axis      Axis to reduce. Data type supported: S32
-   * @param[out] output    indices related to top k values. Data types supported: F32.
+   * @param[in]  input        Input tensor. Data types supported: QASYMM8/S32/F32
+   * @param[in]  reduce_axis  Axes to reduce.
+   * @param[out] output       Output tensor. Data types supported: Same as @p input.
     */
-  void configure(ICLTensor *input, int32_t axis, ICLTensor *output);
+  void configure(ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output);
    /** Static function to check if given info will lead to a valid configuration of @ref
     * CLPixelWiseDivision
     *
-   * @param[in]  input     Input image. Data types supported: F32
-   * @param[in]  axis      Axis to reduce. Data type supported: S32
-   * @param[out] output    indices related to top k values. Data types supported: F32.     *
+   * @param[in]  input        Input tensor info. Data types supported: QASYMM8/S32/F32
+   * @param[in]  reduce_axis  Axes to reduce.
+   * @param[out] output       Output tensor info. Data types supported: Same as @p input.
     *
     * @return a status
     */
-  static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
+  static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
+                         const ITensorInfo *output);
  
    // Inherited methods overridden:
    void run() override;
@@ -70,10 +71,9 @@ public:
  private:
    void run_on_cpu();
  
-  int32_t _axis;
-
    ICLTensor *_input;
    ICLTensor *_output;
+  std::vector<uint32_t> _reduce_axis;
  
    std::unique_ptr<ICLKernel> _kernel;
  };
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp

index 168b246..51318d7 100644 (file)
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
@@ -38,7 +38,8 @@ namespace
  {
  constexpr unsigned int num_elems_processed_per_iteration = 16;
  
-Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
+                          const ITensorInfo *output)
  {
    // We can handle for simple case only
    // Input rank: 2
@@ -46,7 +47,6 @@ Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorI
    // Axis: one axis value, restrict to 1
  
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis only allowed 1");
  
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
                                    "Inputs are not broadcast compatible");
@@ -63,21 +63,30 @@ Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorI
                                      "Only support for input dimension 2");
    }
  
+  const auto num_dimensions = input->tensor_shape().num_dimensions();
+  for (size_t i = 0; i < reduce_axis.size(); ++i)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        reduce_axis[i] >= 0 && reduce_axis[i] < num_dimensions,
+        "reduce_axis must be greater than or equal to 0 and less than (input's rank).");
+  }
+
    return Status{};
  }
  
  } // namespace
  
-CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _axis(0) {}
+CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _reduce_axis() {}
  
-void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTensor *output)
+void CLReduceMaxKernel::configure(const ICLTensor *input, std::vector<uint32_t> reduce_axis,
+                                  ICLTensor *output)
  {
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, output->info()));
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), reduce_axis, output->info()));
  
    _input = input;
    _output = output;
-  _axis = axis;
+  _reduce_axis = reduce_axis;
  
    // Configure kernel window
    int cols = _input->info()->tensor_shape()[0];
@@ -100,11 +109,12 @@ void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTenso
    ICLKernel::configure(win);
  }
  
-Status CLReduceMaxKernel::validate(const ITensorInfo *input, int32_t axis,
+Status CLReduceMaxKernel::validate(const ITensorInfo *input,
+                                   const std::vector<uint32_t> &reduce_axis,
                                     const ITensorInfo *output)
  {
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, output));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, reduce_axis, output));
  
    return Status{};
  }
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp

index 3382058..cc2fc4a 100644 (file)
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
@@ -31,30 +31,30 @@
  namespace arm_compute
  {
  
-CLReduceMax::CLReduceMax() : _axis(0), _input(nullptr), _output(nullptr), _kernel(nullptr) {}
+CLReduceMax::CLReduceMax() : _input(nullptr), _output(nullptr), _reduce_axis(), _kernel(nullptr) {}
  
-void CLReduceMax::configure(ICLTensor *input, int axis, ICLTensor *output)
+void CLReduceMax::configure(ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output)
  {
-  _axis = axis;
+  _reduce_axis = reduce_axis;
  
    _input = input;
    _output = output;
  
    auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>();
-  k->configure(input, axis, output);
+  k->configure(input, reduce_axis, output);
    _kernel = std::move(k);
  
    // We can handle for simple case only
    // Output rank: 1
    // Axis: one axis value, restrict to 1
-  ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().num_dimensions() != 2);
-  ARM_COMPUTE_ERROR_ON(output->info()->tensor_shape().num_dimensions() != 1);
-  ARM_COMPUTE_ERROR_ON(axis != 1);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      CLReduceMaxKernel::validate(input->info(), reduce_axis, output->info()));
  }
  
-Status CLReduceMax::validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
+Status CLReduceMax::validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
+                             const ITensorInfo *output)
  {
-  return CLReduceMaxKernel::validate(input, axis, output);
+  return CLReduceMaxKernel::validate(input, reduce_axis, output);
  }
  
  void CLReduceMax::run()
@@ -68,51 +68,129 @@ void CLReduceMax::run()
  #endif
  }
  
-void CLReduceMax::run_on_cpu()
+inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
  {
-  cl::CommandQueue q = CLScheduler::get().queue();
+  int32_t offset = b * shape[2] * shape[1] * shape[0];
+  offset += d * shape[1] * shape[0];
+  offset += h * shape[0];
+  offset += w;
+  return offset;
+}
  
-  _input->map(q);
-  _output->map(q);
+inline const TensorShape inferOutputShape(const TensorShape &input_shape,
+                                          const std::vector<uint32_t> &reduce_axis)
+{
+  TensorShape out_shape{};
  
-  // Compute by CPU for simple case
-  // Input rank: 2
-  // Output rank: 1
-  // Axis: one axis value, restrict to 1
+  bool keep_axis[4] = {true, true, true, true};
  
-  float *input_data = (float *)_input->buffer();
-  float *output_data = (float *)_output->buffer();
+  for (int i = 0; i < reduce_axis.size(); ++i)
+  {
+    auto axis = reduce_axis[i];
+    keep_axis[axis] = false;
+  }
+
+  for (int i = 0; i < input_shape.num_dimensions(); ++i)
+  {
+    size_t dim = 1;
+    if (keep_axis[i])
+    {
+      dim = input_shape[i];
+    }
+    out_shape.set(i, dim);
+  }
  
-  std::vector<float> container_max;
-  int cols = _input->info()->tensor_shape()[0];
-  int rows = _input->info()->tensor_shape()[1];
-  container_max.resize(rows);
+  return out_shape;
+}
  
-  // Initialize as 1st element in row
-  float *input_pointer = input_data;
-  for (int i = 0; i < rows; i++)
+template <typename T>
+inline T getReduceMax(const T *input_data, const TensorShape &input_shape,
+                      const TensorShape &output_shape, const size_t b, const size_t d,
+                      const size_t h, const size_t w)
+{
+  T max_value = input_data[offset4D(input_shape, b, d, h, w)];
+
+  // If output_shape[dim] == 1, the dimension is treated as reduced and every input value
+  // along it is checked.
+  // Otherwise only the single input index matching the output coordinate is checked.
+  const size_t start_b = output_shape[3] == 1 ? 0 : b;
+  const size_t start_d = output_shape[2] == 1 ? 0 : d;
+  const size_t start_h = output_shape[1] == 1 ? 0 : h;
+  const size_t start_w = output_shape[0] == 1 ? 0 : w;
+  const size_t stop_b = output_shape[3] == 1 ? input_shape[3] - 1 : b;
+  const size_t stop_d = output_shape[2] == 1 ? input_shape[2] - 1 : d;
+  const size_t stop_h = output_shape[1] == 1 ? input_shape[1] - 1 : h;
+  const size_t stop_w = output_shape[0] == 1 ? input_shape[0] - 1 : w;
+  for (size_t in_b = start_b; in_b <= stop_b; ++in_b)
    {
-    container_max[i] = *input_pointer;
-    input_pointer += cols;
+    for (size_t in_d = start_d; in_d <= stop_d; ++in_d)
+    {
+      for (size_t in_h = start_h; in_h <= stop_h; ++in_h)
+      {
+        for (size_t in_w = start_w; in_w <= stop_w; ++in_w)
+        {
+          max_value =
+              std::max<T>(max_value, input_data[offset4D(input_shape, in_b, in_d, in_h, in_w)]);
+        }
+      }
+    }
    }
  
-  // Update max value in row
-  for (int i = 0; i < rows; i++)
+  return max_value;
+}
+
+template <typename T>
+inline void reduceMax(const T *input_data, const TensorShape &input_shape,
+                      const TensorShape &output_shape, T *output_data)
+{
+  for (size_t out_b = 0; out_b < output_shape[3]; ++out_b)
    {
-    float max_in_row = container_max[i];
-    for (int j = 1; j < cols; j++)
+    for (size_t out_d = 0; out_d < output_shape[2]; ++out_d)
      {
-      if (max_in_row < input_data[i * cols + j])
+      for (size_t out_h = 0; out_h < output_shape[1]; ++out_h)
        {
-        max_in_row = input_data[i * cols + j];
+        for (size_t out_w = 0; out_w < output_shape[0]; ++out_w)
+        {
+          output_data[offset4D(output_shape, out_b, out_d, out_h, out_w)] =
+              getReduceMax(input_data, input_shape, output_shape, out_b, out_d, out_h, out_w);
+        }
        }
      }
-    container_max[i] = max_in_row;
    }
+}
+
+void CLReduceMax::run_on_cpu()
+{
+  cl::CommandQueue q = CLScheduler::get().queue();
+
+  _input->map(q);
+  _output->map(q);
+
+  TensorShape input_shape = _input->info()->tensor_shape();
+  TensorShape output_shape = inferOutputShape(input_shape, _reduce_axis);
+
+  // NOTE input_shape and output_shape may have fewer than 4 dimensions.
+  //      Both are set to 4 dimensions so that ranks up to 4 are handled uniformly.
+  input_shape.set_num_dimensions(4);
+  output_shape.set_num_dimensions(4);
  
-  for (int i = 0; i < rows; i++)
+  switch (_input->info()->data_type())
    {
-    output_data[i] = container_max[i];
+    case DataType::QASYMM8:
+      reduceMax(reinterpret_cast<const uint8_t *>(_input->buffer()), input_shape, output_shape,
+                reinterpret_cast<uint8_t *>(_output->buffer()));
+      break;
+    case DataType::S32:
+      reduceMax(reinterpret_cast<const int32_t *>(_input->buffer()), input_shape, output_shape,
+                reinterpret_cast<int32_t *>(_output->buffer()));
+      break;
+    case DataType::F32:
+      reduceMax(reinterpret_cast<const float *>(_input->buffer()), input_shape, output_shape,
+                reinterpret_cast<float *>(_output->buffer()));
+      break;
+    defualt:
+      ARM_COMPUTE_ERROR("DataType not supported");
+      break;
    }
  
    _input->unmap(q);
diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc

index 093d0a9..60b6e20 100644 (file)
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -2500,43 +2500,76 @@ void Planner::visit(const ::internal::tflite::op::ReduceMax::Node &node)
    const ::internal::tflite::operand::Index ifm_index{node.param().ifm_index};
    const ::internal::tflite::operand::Index axis_index{node.param().axis_index};
  
-  // Handle special case only:
-  //   Input: Matrix (rank 2)
-  //   Output: Vector (rank 1)
-  //   Axis: one element (scalar or rank 1 with 1 element), constant
    auto ifm_shape = _ctx.at(ifm_index).shape();
    auto ofm_shape = _ctx.at(ofm_index).shape();
    auto axis_shape = _ctx.at(axis_index).shape();
-  assert(ofm_shape.rank() == 1);
-  assert(ifm_shape.rank() == 2);
+  assert(ifm_shape.rank() <= 4);
+  assert(ofm_shape.rank() <= ifm_shape.rank());
    assert(_ctx.at(axis_index).hasData());
-  assert(axis_shape.rank() == 0 || ((axis_shape.rank() == 1) && (axis_shape.dim(0) == 1)));
+  assert(axis_shape.rank() == 0 || axis_shape.rank() == 1);
  
+  if (!(ifm_shape.rank() == ofm_shape.rank()))
+  {
+    // The input's rank is always greater than or equal to the output's rank,
+    // so the output's rank has to be extended to match.
+    const_cast<::internal::tflite::operand::Shape &>(_ctx.at(ofm_index).shape())
+        .extendRank(ifm_shape.rank());
+  }
    _builder.addShapeConstr(ofm_index, asTensorInfo(asTensorShape(_ctx.at(ofm_index).shape()),
                                                    _ctx.at(ofm_index).type()));
    _builder.addShapeConstr(ifm_index, asTensorInfo(asTensorShape(_ctx.at(ifm_index).shape()),
                                                    _ctx.at(ifm_index).type()));
  
-  // Note: Assume only one element in axis. It is checked by assertion above
-  // TODO: handle general case
-  // Axis is integer value (generally, int32)
-  int32_t axis_value = _ctx.at(axis_index).asScalar<int32_t>();
-  assert(axis_value == 1);
+  std::vector<uint32_t> axis;
+  {
+    const auto ifm_rank = ifm_shape.rank();
+    switch (axis_shape.rank())
+    {
+      case 0: // scalar
+      {
+        int32_t axis_value = _ctx.at(axis_index).asScalar<int32_t>();
+        if (axis_value < 0)
+        {
+          axis_value += ifm_rank;
+        }
+        axis.push_back(ToARMComputeAxis(ifm_rank, axis_value).value());
+        break;
+      }
+      case 1: // vector
+      {
+        const auto axis_base = _ctx.at(axis_index).data().base();
+        const auto axis_size = _ctx.at(axis_index).shape().asVector();
+
+        for (uint32_t n = 0; n < axis_size; ++n)
+        {
+          int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n);
+          if (axis_value < 0)
+          {
+            axis_value += ifm_rank;
+          }
+          axis.push_back(ToARMComputeAxis(ifm_rank, axis_value).value());
+        }
+        break;
+      }
+      default:
+        throw std::runtime_error("Not supported");
+        break;
+    }
+  }
  
    // Construct operation parameters
    struct Param
    {
      int ofm_index;
      int ifm_index;
-
-    int32_t axis;
+    std::vector<uint32_t> axis;
    };
  
    Param param;
  
    param.ofm_index = ofm_index.asInt();
    param.ifm_index = ifm_index.asInt();
-  param.axis = axis_value;
+  param.axis = axis;
  
    auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
      auto ofm_alloc = ctx.at(::internal::tflite::operand::Index{param.ofm_index});
author	장지섭/동작제어Lab(SR)/Engineer/삼성전자 <jiseob.jang@samsung.com>
	Tue, 2 Oct 2018 01:06:10 +0000 (10:06 +0900)
committer	오형석/동작제어Lab(SR)/Staff Engineer/삼성전자 <hseok82.oh@samsung.com>
	Tue, 2 Oct 2018 01:06:10 +0000 (10:06 +0900)
libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h		patch \| blob \| history
libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h		patch \| blob \| history
libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp		patch \| blob \| history
libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp		patch \| blob \| history
runtimes/pure_arm_compute/src/compilation.cc		patch \| blob \| history