From: 장지섭/동작제어Lab(SR)/Engineer/삼성전자
Date: Mon, 5 Nov 2018 07:04:39 +0000 (+0900)
Subject: Support ReduceMax kernel for cl up to 4-dimensions (#3340)
X-Git-Tag: 0.3~476
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=4af4ed2b4622ec7c2a842b1510c362be8594ef45;p=platform%2Fcore%2Fml%2Fnnfw.git

Support ReduceMax kernel for cl up to 4-dimensions (#3340)

* Support ReduceMax kernel for cl up to 4-dimensions

This commit supports the ReduceMax kernel for cl up to 4 dimensions.

Signed-off-by: jiseob.jang

* Optimize ReduceMax Kernel for cl

This commit optimizes the ReduceMax kernel for cl.
  - Change from calling one all-at-once kernel to calling separate kernels
    multiple times, one per reduction axis.

Signed-off-by: jiseob.jang
---

diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
index 8f32ad5..c97202e 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
@@ -64,24 +64,23 @@ public:
   CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default;
   /**
    * @brief Initialise the kernel's input, output and border mode.
-   * @param[in]  input       An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+   * @param[in]  input       An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+   * @param[out] output      The output tensor. Data types supported: same as @p input.
    * @param[in]  reduce_axis Axis to reduce
-   * @param[out] output      The output tensor, Data types supported: same as @p input1. Note:
-   *                         U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
    * return N/A
    */
-  void configure(const ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output);
+  void configure(const ICLTensor *input, ICLTensor *output, const uint32_t reduce_axis);
   /**
    * @brief Static function to check if given info will lead to a valid configuration of @ref
    *        CLReduceMaxKernel
-   * @param[in] input       An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
-   * @param[in] reduce_axis Axis to reduce
+   * @param[in] input       An input tensor info. Data types supported: U8/QASYMM8/S32/F32.
    * @param[in] output      The output tensor info, Data types supported: same as @p input1.
+   * @param[in] reduce_axis Axis to reduce
    *                        Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
    * @return a status
    */
-  static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
-                         const ITensorInfo *output);
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                         const uint32_t reduce_axis);

   /*
    * @brief Run CLReduceMaxKernel op
@@ -100,7 +99,7 @@ public:
 private:
   const ICLTensor *_input;
   ICLTensor *_output;
-  std::vector<uint32_t> _reduce_axis;
+  uint32_t _reduce_axis;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
index 9f36fff..40de94e 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
@@ -24,7 +24,9 @@
 #ifndef __ARM_COMPUTE_CLREDUCE_MAX_H__
 #define __ARM_COMPUTE_CLREDUCE_MAX_H__

+#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
 #include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"

 namespace arm_compute
 {
@@ -66,18 +68,20 @@ public:
   /**
    * @brief Initialise the kernel's inputs and outputs.
-   * @param[in]  input  Input tensor
-   * @param[in]  axis   Axis to reduce
-   * @param[out] output The result of ReduceMax operation
+   * @param[in]  input  Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+   * @param[in]  axis   Axis to reduce. It must be sorted and contain no duplicates.
+   * @param[out] output The result of the ReduceMax operation. Data types supported: same as @p
+   *                    input.
    * @return N/A
    */
   void configure(ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output);
   /**
    * @brief Static function to check if given info will lead to a valid configuration
-   * @param[in]  input  Input tensor
+   * @param[in]  input  Input tensor. Data types supported: U8/QASYMM8/S32/F32.
    * @param[in]  axis   Axis to reduce
-   * @param[out] output The result of ReduceMax operation
+   * @param[out] output The result of the ReduceMax operation. Data types supported: same as @p
+   *                    input.
    * @return a status
    */
   static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
@@ -100,7 +104,9 @@ private:
   ICLTensor *_output;
   std::vector<uint32_t> _reduce_axis;

-  std::unique_ptr<ICLKernel> _kernel;
+  std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+  std::unique_ptr<CLReduceMaxKernel[]> _reduction_kernels{nullptr};
+  size_t _num_of_kernels;
 };
 }
 #endif /*__ARM_COMPUTE_CLREDUCE_MAX_H__ */
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
index 21fc304..ca7d315 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
@@ -16,37 +16,59 @@
  */
 #include "helpers.h"

-#if defined(WIDTH)
+#if defined(DATA_TYPE) && defined(DEPTH_OUT)
 /** Perform reduce max
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
  *
- * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[out] output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[out] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[out] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  stops                                Index of the last input element for each axis
+ *                                                  If the value is -1, that axis is reduced
  */
-__kernel void reduce_max(VECTOR_DECLARATION(input),
-                         VECTOR_DECLARATION(output))
+__kernel void reduce_max(TENSOR4D_DECLARATION(input),
+                         TENSOR4D_DECLARATION(output),
+                         const int4 stops)
 {
-    Vector input = CONVERT_TO_VECTOR_STRUCT(input);
-    Vector output = CONVERT_TO_VECTOR_STRUCT(output);
+    // TODO Do not use for loop
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);

-    __global float *input_addr = (__global float *)(input.ptr);
-    __global float *output_addr = (__global float *)(output.ptr);
+    DATA_TYPE max_value = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, get_global_id(2) / DEPTH_OUT));

-    float max_value = *input_addr;
-    for(int x = 1; x < WIDTH; x++)
+    for(int x = (stops.x == -1 ? get_global_id(0) : 0); x <= (stops.x == -1 ? get_global_id(0) : stops.x); ++x)
     {
-        float value = *(input_addr + x);
-        max_value = max(value, max_value);
+        for(int y = (stops.y == -1 ? get_global_id(1) : 0); y <= (stops.y == -1 ? get_global_id(1) : stops.y); ++y)
+        {
+            for(int z = (stops.z == -1 ? get_global_id(2) % DEPTH_OUT : 0); z <= (stops.z == -1 ? get_global_id(2) % DEPTH_OUT : stops.z); ++z)
+            {
+                for(int w = (stops.w == -1 ? get_global_id(2) / DEPTH_OUT : 0); w <= (stops.w == -1 ? get_global_id(2) / DEPTH_OUT : stops.w); ++w)
+                {
+                    max_value = max(max_value, *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, z, w)));
+                }
+            }
+        }
     }

-    // Store max
-    *output_addr = max_value;
+    *((__global DATA_TYPE *)out.ptr) = max_value;
 }
-#endif // defined(WIDTH)
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
index 4fa08e6..bae9475 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
@@ -24,41 +24,38 @@ using namespace arm_compute;

 namespace
 {
-constexpr unsigned int num_elems_processed_per_iteration = 16;
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t reduce_axis)
+{
+  TensorShape out_shape{input_shape};
+
+  out_shape.set(reduce_axis, 1);
+
+  return out_shape;
+}
+} // namespace

-Status validate_arguments(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
-                          const ITensorInfo *output)
+namespace
 {
-  // We can handle for simple case only
-  // Input rank: 2
-  // Output rank: 1
-  // Axis: one axis value, restrict to 1
+constexpr unsigned int num_elems_processed_per_iteration = 16;

-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const uint32_t reduce_axis)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32,
+                                                       DataType::U8, DataType::QASYMM8);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);

   ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
                                   "Inputs are not broadcast compatible");

-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(),
-                                    "Output same type allowed for input and output");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1,
-                                    "Only support for output dimension 1");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2,
-                                    "Only support for input dimension 2");
-  }
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), reduce_axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output shape's size does not match reduce_axis");

   const auto num_dimensions = input->tensor_shape().num_dimensions();
-  for (size_t i = 0; i < reduce_axis.size(); ++i)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        reduce_axis[i] >= 0 && reduce_axis[i] < num_dimensions,
-        "reduce_axis must be greater than or equal to 0 and less than (input's rank).");
-  }
-
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      reduce_axis >= num_dimensions,
+      "reduce_axis must be greater than or equal to 0 and less than (input's rank).");
   return Status{};
 }

@@ -66,43 +63,46 @@ Status validate_arguments(const ITensorInfo *input, const std::vector<uint32_t>

 CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _reduce_axis() {}

-void CLReduceMaxKernel::configure(const ICLTensor *input, std::vector<uint32_t> reduce_axis,
-                                  ICLTensor *output)
+void CLReduceMaxKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                  const uint32_t reduce_axis)
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), reduce_axis, output->info()));
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), reduce_axis));

   _input = input;
   _output = output;
   _reduce_axis = reduce_axis;

-  // Configure kernel window
-  int cols = _input->info()->tensor_shape()[0];
-  int rows = _input->info()->tensor_shape()[1];
-  Window win;
-  win.set(0, Window::Dimension(0, cols, 1));
-  win.set(1, Window::Dimension(0, rows, 1));
+  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), reduce_axis));

   // Construct kernel name
   std::string kernel_name = "reduce_max";

   // Set kernel build options
   std::set<std::string> build_opts;
-  build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols));
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));

   // Create kernel
   _kernel =
       static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));

+  // Configure kernel window
+  Window win = calculate_max_window(*output_info, Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
   ICLKernel::configure(win);
 }

-Status CLReduceMaxKernel::validate(const ITensorInfo *input,
-                                   const std::vector<uint32_t> &reduce_axis,
-                                   const ITensorInfo *output)
+Status CLReduceMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                   const uint32_t reduce_axis)
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, reduce_axis, output));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, reduce_axis));

   return Status{};
 }

@@ -112,16 +112,41 @@ void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue)
   ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
   ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);

-  Window window_input = window;
-  Window slice_input = window_input.first_slice_window_1D();
+  const TensorShape &shape_in = _input->info()->tensor_shape();
+
+  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+  // Initialize to -1, which means the axis is not reduced
+  cl_int4 stops = {{
+      static_cast<cl_int>(-1), static_cast<cl_int>(-1), static_cast<cl_int>(-1),
+      static_cast<cl_int>(-1),
+  }};
+
+  stops.s[_reduce_axis] = static_cast<cl_int>(shape_in[_reduce_axis] - 1);
+
+  _kernel.setArg<cl_int4>(idx++, stops);
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Copy the output's shape so that it can be restored at the end of this method
+  const TensorShape shape_out = _output->info()->tensor_shape();
+  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _reduce_axis));

   do
   {
-    Window slice_output = slice_input.shift_dimensions(1);
     unsigned int idx = 0;
-    add_1D_tensor_argument(idx, _input, slice_input);
-    add_1D_tensor_argument(idx, _output, slice_output);
-    enqueue(queue, *this, slice_input);
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-  } while (window_input.slide_window_slice_1D(slice_input));
+
+  // Restore the output tensor's original shape
+  _output->info()->set_tensor_shape(shape_out);
 }
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
index 1a6f5e9..cdaa5c9 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
@@ -18,36 +18,97 @@

 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
-
-#define REDUCE_MAX_RUN_ON_CPU 1
+#include "arm_compute/core/TensorInfo.h"

 namespace arm_compute
 {

-CLReduceMax::CLReduceMax() : _input(nullptr), _output(nullptr), _reduce_axis(), _kernel(nullptr) {}
+CLReduceMax::CLReduceMax()
+    : _input(nullptr), _output(nullptr), _reduce_axis(), _interm_tensors(), _reduction_kernels(),
+      _num_of_kernels()
+{
+}

 void CLReduceMax::configure(ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output)
 {
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), reduce_axis, output->info()));
+
   _reduce_axis = reduce_axis;
   _input = input;
   _output = output;

-  auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>();
-  k->configure(input, reduce_axis, output);
-  _kernel = std::move(k);
+  // NOTE The reduce_axis must contain no duplicates.
+  _num_of_kernels = reduce_axis.size();
+  const size_t num_of_interm_tensors = _num_of_kernels - 1;
+
+  _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+  _reduction_kernels =
+      arm_compute::support::cpp14::make_unique<CLReduceMaxKernel[]>(_num_of_kernels);
+
+  TensorShape shape{input->info()->tensor_shape()};
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    shape.set(reduce_axis[i], 1);
+    _interm_tensors[i].allocator()->init(
+        TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+    _interm_tensors[i].allocator()->allocate();
+  }
+
+  // Set up a vector of the ICLTensors in order: input, intermediate tensors, output.
+  std::vector<ICLTensor *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    tensors.emplace_back(_interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);

-  // We can handle for simple case only
-  // Output rank: 1
-  // Axis: one axis value, restrict to 1
-  ARM_COMPUTE_ERROR_THROW_ON(
-      CLReduceMaxKernel::validate(input->info(), reduce_axis, output->info()));
+  // Configure a ReduceMax kernel for each axis
+  for (size_t i = 0; i < _num_of_kernels; i++)
+  {
+    _reduction_kernels[i].configure(tensors[i], tensors[i + 1], reduce_axis[i]);
+  }
 }

 Status CLReduceMax::validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
                              const ITensorInfo *output)
 {
-  return CLReduceMaxKernel::validate(input, reduce_axis, output);
+  const size_t num_of_kernels = reduce_axis.size();
+  const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+  // Create temporary tensor infos
+  auto interm_tensors =
+      arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+  // Create intermediate tensor info
+  TensorShape shape{input->tensor_shape()};
+
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    shape.set(reduce_axis[i], 1);
+    interm_tensors[i].set_data_type(input->data_type());
+    interm_tensors[i].set_tensor_shape(shape);
+    interm_tensors[i].set_num_channels(input->num_channels());
+  }
+
+  // Set up a vector of the ITensorInfos in order: input, intermediate infos, output.
+  std::vector<const ITensorInfo *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    tensors.emplace_back(interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Validate each ReduceMax kernel
+  for (size_t i = 0; i < num_of_kernels; i++)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLReduceMaxKernel::validate(tensors[i], tensors[i + 1], reduce_axis[i]));
+  }
+
+  return Status{};
 }

 void CLReduceMax::run()
 {
@@ -57,7 +118,10 @@ void CLReduceMax::run()

   arm_compute::CLScheduler::get().sync();
 #else
-  arm_compute::CLScheduler::get().enqueue(*_kernel);
+  for (size_t i = 0; i < _num_of_kernels; ++i)
+  {
+    CLScheduler::get().enqueue(_reduction_kernels[i]);
+  }
 #endif
 }
diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc
index 5b627ab..afbd8e7 100644
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -2632,6 +2632,9 @@ void Planner::visit(const ::internal::tflite::op::ReduceMax::Node &node)
       throw std::runtime_error("Not supported");
       break;
   }
+  std::sort(axis.begin(), axis.end());
+  auto last = std::unique(axis.begin(), axis.end());
+  axis.erase(last, axis.end());
 }

 // Construct operation parameters
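
For illustration only (not part of the patch): a minimal sketch of how the reworked
CLReduceMax chains one CLReduceMaxKernel per axis through intermediate tensors. The
tensor shapes, the main() harness, and the use of CLScheduler::default_init() are
assumptions for the example, not code from this commit.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReduceMax.h"

    using namespace arm_compute;

    int main()
    {
      CLScheduler::get().default_init(); // set up the CL context and queue

      // Hypothetical 4-D reduction over axes 1 and 2: [2, 3, 4, 5] -> [2, 1, 1, 5]
      CLTensor input, output;
      input.allocator()->init(TensorInfo(TensorShape(2U, 3U, 4U, 5U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(2U, 1U, 1U, 5U), 1, DataType::F32));

      // configure() builds one CLReduceMaxKernel per axis and an intermediate
      // CLTensor between consecutive kernels:
      //   kernel 0: [2, 3, 4, 5] -> intermediate [2, 1, 4, 5]  (axis 1)
      //   kernel 1: [2, 1, 4, 5] -> output       [2, 1, 1, 5]  (axis 2)
      CLReduceMax reduce_max;
      reduce_max.configure(&input, {1, 2}, &output); // axes sorted, no duplicates

      input.allocator()->allocate();
      output.allocator()->allocate();
      // ... fill `input` ...

      reduce_max.run(); // enqueues the per-axis kernels in order
      CLScheduler::get().sync();
      return 0;
    }

Splitting the reduction this way is what the stops argument of the CL kernel encodes:
each launch reduces exactly one axis (the one whose stops entry is not -1), so every
work-item loops over a single dimension instead of the whole reduced volume.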