Support ReduceMax kernel for cl up to 4-dimensions (#3340)
author장지섭/동작제어Lab(SR)/Engineer/삼성전자 <jiseob.jang@samsung.com>
Mon, 5 Nov 2018 07:04:39 +0000 (16:04 +0900)
committer오형석/동작제어Lab(SR)/Staff Engineer/삼성전자 <hseok82.oh@samsung.com>
Mon, 5 Nov 2018 07:04:39 +0000 (16:04 +0900)
* Support ReduceMax kernel for cl up to 4-dimensions

This commit supports ReduceMax kernel for cl up to 4-dimensions.

Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
* Optimize ReduceMax Kernel for cl

This commit optimizes ReduceMax kernel for cl.
  - Change calling kernel from at once kernel to call separated kernels multiple times.

Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
runtimes/pure_arm_compute/src/compilation.cc

index 8f32ad5..c97202e 100644 (file)
@@ -64,24 +64,23 @@ public:
   CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default;
   /**
    * @brief Initialise the kernel's input, output and border mode.
-   * @param[in]  input          An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+   * @param[in]  input          An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+   * @param[out] output         The output tensor, Data types supported: same as @p input.
    * @param[in]  reduce_axis    Axis to reduce
-   * @param[out] output         The output tensor, Data types supported: same as @p input1. Note:
-   * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
    * return N/A
    */
-  void configure(const ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output);
+  void configure(const ICLTensor *input, ICLTensor *output, const uint32_t reduce_axis);
   /**
    * @brief Static function to check if given info will lead to a valid configuration of @ref
    * CLReduceMaxKernel
-   * @param[in] input           An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
-   * @param[in] reduce_axis     Axis to reduce
+   * @param[in] input           An input tensor info. Data types supported: U8/QASYMM8/S32/F32.
    * @param[in] output          The output tensor info, Data types supported: same as @p input1.
+   * @param[in] reduce_axis     Axis to reduce
    * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
    * @return a status
    */
-  static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
-                         const ITensorInfo *output);
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                         const uint32_t reduce_axis);
 
   /*
    * @brief Run CLReduceMaxKernel op
@@ -100,7 +99,7 @@ public:
 private:
   const ICLTensor *_input;
   ICLTensor *_output;
-  std::vector<uint32_t> _reduce_axis;
+  uint32_t _reduce_axis;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */
index 9f36fff..40de94e 100644 (file)
@@ -24,7 +24,9 @@
 #ifndef __ARM_COMPUTE_CLREDUCE_MAX_H__
 #define __ARM_COMPUTE_CLREDUCE_MAX_H__
 
+#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
 #include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"
 
 namespace arm_compute
@@ -66,18 +68,20 @@ public:
 
   /**
    * @brief Initialise the kernel's inputs and outputs.
-   * @param[in]  input     Input tensor
-   * @param[in]  axis      Axis to reduce
-   * @param[out] output    The result of ReduceMax operation
+   * @param[in]  input     Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+   * @param[in]  axis      Axis to reduce. It must be sorted and no duplicates.
+   * @param[out] output    The result of ReduceMax operation. Data types supported: same as @p
+   * input.
    * @return N/A
    */
   void configure(ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output);
 
   /**
    * @brief Static function to check if given info will lead to a valid configuration
-   * @param[in]  input     Input tensor
+   * @param[in]  input     Input tensor. Data types supported: U8/QASYMM8/S32/F32.
    * @param[in]  axis      Axis to reduce
-   * @param[out] output    The result of ReduceMax operation
+   * @param[out] output    The result of ReduceMax operation. Data types supported: same as @p
+   * input.
    * @return a status
    */
   static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
@@ -100,7 +104,9 @@ private:
   ICLTensor *_output;
   std::vector<uint32_t> _reduce_axis;
 
-  std::unique_ptr<ICLKernel> _kernel;
+  std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+  std::unique_ptr<CLReduceMaxKernel[]> _reduction_kernels{nullptr};
+  size_t _num_of_kernels;
 };
 }
 #endif /*__ARM_COMPUTE_CLREDUCE_MAX_H__ */
index 21fc304..ca7d315 100644 (file)
  */
 #include "helpers.h"
 
-#if defined(WIDTH)
+#if defined(DATA_TYPE) && defined(DEPTH_OUT)
 /** Perform reduce max
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
  *
- * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types:  F16/F32
- * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[out] output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[out] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[out] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  stops                                Index of the last input element for each axis
+ *                                                  If value is -1, that means the axis is not reduced
  */
-__kernel void reduce_max(VECTOR_DECLARATION(input),
-                         VECTOR_DECLARATION(output))
+__kernel void reduce_max(TENSOR4D_DECLARATION(input),
+                         TENSOR4D_DECLARATION(output),
+                         const int4 stops)
 {
-    Vector input = CONVERT_TO_VECTOR_STRUCT(input);
-    Vector output = CONVERT_TO_VECTOR_STRUCT(output);
+    // TODO Do not use for loop
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
 
-    __global float *input_addr = (__global float *)(input.ptr);
-    __global float *output_addr = (__global float *)(output.ptr);
+    DATA_TYPE max_value = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, get_global_id(2) / DEPTH_OUT));
 
-    float max_value = *input_addr;
-    for(int x = 1; x < WIDTH; x++)
+    for(int x = (stops.x == -1 ? get_global_id(0) : 0); x <= (stops.x == -1 ? get_global_id(0) : stops.x); ++x)
     {
-        float value = *(input_addr + x);
-        max_value = max(value, max_value);
+        for(int y = (stops.y == -1 ? get_global_id(1) : 0); y <= (stops.y == -1 ? get_global_id(1) : stops.y); ++y)
+        {
+            for(int z = (stops.z == -1 ? get_global_id(2) % DEPTH_OUT : 0); z <= (stops.z == -1 ? get_global_id(2) % DEPTH_OUT : stops.z); ++z)
+            {
+                for(int w = (stops.w == -1 ? get_global_id(2) / DEPTH_OUT : 0); w <= (stops.w == -1 ? get_global_id(2) / DEPTH_OUT : stops.w); ++w)
+                {
+                    max_value = max(max_value, *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, z, w)));
+                }
+            }
+        }
     }
 
-    // Store max
-    *output_addr = max_value;
+    *((__global DATA_TYPE *)out.ptr) = max_value;
 }
-#endif // defined(WIDTH)
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT)
index 4fa08e6..bae9475 100644 (file)
@@ -24,41 +24,38 @@ using namespace arm_compute;
 
 namespace
 {
-constexpr unsigned int num_elems_processed_per_iteration = 16;
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t reduce_axis)
+{
+  TensorShape out_shape{input_shape};
+
+  out_shape.set(reduce_axis, 1);
+
+  return out_shape;
+}
+} // namespace
 
-Status validate_arguments(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
-                          const ITensorInfo *output)
+namespace
 {
-  // We can handle for simple case only
-  // Input rank: 2
-  // Output rank: 1
-  // Axis: one axis value, restrict to 1
+constexpr unsigned int num_elems_processed_per_iteration = 16;
 
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const uint32_t reduce_axis)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32,
+                                                       DataType::U8, DataType::QASYMM8);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
   ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
                                   "Inputs are not broadcast compatible");
 
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(),
-                                    "Output same type allowed for input and output");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1,
-                                    "Only support for output dimension 1");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2,
-                                    "Only support for input dimension 2");
-  }
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), reduce_axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output shape's size does not match reduce_axis");
 
   const auto num_dimensions = input->tensor_shape().num_dimensions();
-  for (size_t i = 0; i < reduce_axis.size(); ++i)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        reduce_axis[i] >= 0 && reduce_axis[i] < num_dimensions,
-        "reduce_axis must be greater than or equal to 0 and less than (input's rank).");
-  }
-
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      reduce_axis >= 0 && reduce_axis < num_dimensions,
+      "reduce_axis must be greater than or equal to 0 and less than (input's rank).");
   return Status{};
 }
 
@@ -66,43 +63,46 @@ Status validate_arguments(const ITensorInfo *input, const std::vector<uint32_t>
 
 CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _reduce_axis() {}
 
-void CLReduceMaxKernel::configure(const ICLTensor *input, std::vector<uint32_t> reduce_axis,
-                                  ICLTensor *output)
+void CLReduceMaxKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                  const uint32_t reduce_axis)
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), reduce_axis, output->info()));
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), reduce_axis));
 
   _input = input;
   _output = output;
   _reduce_axis = reduce_axis;
 
-  // Configure kernel window
-  int cols = _input->info()->tensor_shape()[0];
-  int rows = _input->info()->tensor_shape()[1];
-  Window win;
-  win.set(0, Window::Dimension(0, cols, 1));
-  win.set(1, Window::Dimension(0, rows, 1));
+  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), reduce_axis));
 
   // Construct kernel name
   std::string kernel_name = "reduce_max";
 
   // Set kernel build options
   std::set<std::string> build_opts;
-  build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols));
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
 
   // Create kernel
   _kernel =
       static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
 
+  // Configure  kernel window
+  Window win = calculate_max_window(*output_info, Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
   ICLKernel::configure(win);
 }
 
-Status CLReduceMaxKernel::validate(const ITensorInfo *input,
-                                   const std::vector<uint32_t> &reduce_axis,
-                                   const ITensorInfo *output)
+Status CLReduceMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                   const uint32_t reduce_axis)
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, reduce_axis, output));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, reduce_axis));
 
   return Status{};
 }
@@ -112,16 +112,41 @@ void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue)
   ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
   ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-  Window window_input = window;
-  Window slice_input = window_input.first_slice_window_1D();
+  const TensorShape &shape_in = _input->info()->tensor_shape();
+
+  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+  // Initialize as -1 that means the axis is not reduced
+  cl_int4 stops = {{
+      static_cast<cl_int>(-1), static_cast<cl_int>(-1), static_cast<cl_int>(-1),
+      static_cast<cl_int>(-1),
+  }};
+
+  stops.s[_reduce_axis] = static_cast<cl_int>(shape_in[_reduce_axis] - 1);
+
+  _kernel.setArg<cl_int4>(idx++, stops);
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Copy output's shape in order to use for recovering at end of this method
+  const TensorShape shape_out = _output->info()->tensor_shape();
+  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _reduce_axis));
 
   do
   {
-    Window slice_output = slice_input.shift_dimensions(1);
     unsigned int idx = 0;
-    add_1D_tensor_argument(idx, _input, slice_input);
-    add_1D_tensor_argument(idx, _output, slice_output);
-    enqueue(queue, *this, slice_input);
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
 
-  } while (window_input.slide_window_slice_1D(slice_input));
+  // Recover output's shape of output tensor
+  _output->info()->set_tensor_shape(shape_out);
 }
index 1a6f5e9..cdaa5c9 100644 (file)
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
-
-#define REDUCE_MAX_RUN_ON_CPU 1
+#include "arm_compute/core/TensorInfo.h"
 
 namespace arm_compute
 {
 
-CLReduceMax::CLReduceMax() : _input(nullptr), _output(nullptr), _reduce_axis(), _kernel(nullptr) {}
+CLReduceMax::CLReduceMax()
+    : _input(nullptr), _output(nullptr), _reduce_axis(), _interm_tensors(), _reduction_kernels(),
+      _num_of_kernels()
+{
+}
 
 void CLReduceMax::configure(ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output)
 {
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), reduce_axis, output->info()));
+
   _reduce_axis = reduce_axis;
 
   _input = input;
   _output = output;
 
-  auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>();
-  k->configure(input, reduce_axis, output);
-  _kernel = std::move(k);
+  // NOTE The reduce_axis must have no duplication.
+  _num_of_kernels = reduce_axis.size();
+  const size_t num_of_interm_tensors = _num_of_kernels - 1;
+
+  _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+  _reduction_kernels =
+      arm_compute::support::cpp14::make_unique<CLReduceMaxKernel[]>(_num_of_kernels);
+
+  TensorShape shape{input->info()->tensor_shape()};
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    shape.set(reduce_axis[i], 1);
+    _interm_tensors[i].allocator()->init(
+        TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+    _interm_tensors[i].allocator()->allocate();
+  }
+
+  // Set a vector that is ordered ICLTensors sequentially.
+  std::vector<ICLTensor *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    tensors.emplace_back(_interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
 
-  // We can handle for simple case only
-  // Output rank: 1
-  // Axis: one axis value, restrict to 1
-  ARM_COMPUTE_ERROR_THROW_ON(
-      CLReduceMaxKernel::validate(input->info(), reduce_axis, output->info()));
+  // Apply ReduceMax on all kernels
+  for (size_t i = 0; i < _num_of_kernels; i++)
+  {
+    _reduction_kernels[i].configure(tensors[i], tensors[i + 1], reduce_axis[i]);
+  }
 }
 
 Status CLReduceMax::validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
                              const ITensorInfo *output)
 {
-  return CLReduceMaxKernel::validate(input, reduce_axis, output);
+  const size_t num_of_kernels = reduce_axis.size();
+  const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+  // Create temporary tensor infos
+  auto interm_tensors =
+      arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+  // Create intermediate tensor info
+  TensorShape shape{input->tensor_shape()};
+
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    shape.set(reduce_axis[i], 1);
+    interm_tensors[i].set_data_type(input->data_type());
+    interm_tensors[i].set_tensor_shape(shape);
+    interm_tensors[i].set_num_channels(input->num_channels());
+  }
+
+  // Set a vector that is ordered ITensorInfo sequentially.
+  std::vector<const ITensorInfo *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    tensors.emplace_back(interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Validate ReduceMax only on all kernels
+  for (size_t i = 0; i < num_of_kernels; i++)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLReduceMaxKernel::validate(tensors[i], tensors[i + 1], reduce_axis[i]));
+  }
+
+  return Status{};
 }
 
 void CLReduceMax::run()
@@ -57,7 +118,10 @@ void CLReduceMax::run()
 
   arm_compute::CLScheduler::get().sync();
 #else
-  arm_compute::CLScheduler::get().enqueue(*_kernel);
+  for (size_t i = 0; i < _num_of_kernels; ++i)
+  {
+    CLScheduler::get().enqueue(_reduction_kernels[i]);
+  }
 #endif
 }
 
index 5b627ab..afbd8e7 100644 (file)
@@ -2632,6 +2632,9 @@ void Planner::visit(const ::internal::tflite::op::ReduceMax::Node &node)
         throw std::runtime_error("Not supported");
         break;
     }
+    std::sort(axis.begin(), axis.end());
+    auto last = std::unique(axis.begin(), axis.end());
+    axis.erase(last, axis.end());
   }
 
   // Construct operation parameters