From af76fbca19f76449164406f7ff2e69dc8e6a5c88 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=EC=9E=A5=EC=A7=80=EC=84=AD/=EB=8F=99=EC=9E=91=EC=A0=9C?=
 =?utf8?q?=EC=96=B4Lab=28SR=29/Engineer/=EC=82=BC=EC=84=B1=EC=A0=84?=
 =?utf8?q?=EC=9E=90?= <jiseob.jang@samsung.com>
Date: Mon, 26 Nov 2018 14:43:58 +0900
Subject: [PATCH] Apply CL Kernel of ReduceOperation to PACL. (#3673)

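This commit switches the ReduceMax, Mean, and ReduceSum operations in PACL to
the unified CLReduceOperation CL kernel, and normalizes negative axis values
for Mean and ReduceSum.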

Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
---
 libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp |   9 +-
 runtimes/pure_arm_compute/src/compilation.cc      | 113 +++++++++++-----------
 2 files changed, 65 insertions(+), 57 deletions(-)

diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 3c28739..786ed31 100644
--- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -244,8 +244,9 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl"},
     {"pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl"},
     {"quantization_layer", "quantization_layer.cl"},
-    {"reduce_max", "reduce_max.cl"},
-    {"reduce_sum", "reduce_sum.cl"},
+    {"reduce_max", "reduce_operation.cl"},
+    {"reduce_mean", "reduce_operation.cl"},
+    {"reduce_sum", "reduce_operation.cl"},
     {"reduction_operation", "reduction_operation.cl"},
     {"reduction_mean", "reduction_mean.cl"},
     {"remap_nearest_neighbour", "remap.cl"},
@@ -392,6 +393,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
 #include "./cl_kernels/pixelwise_div_int.clembed"
     },
     {
+        "reduce_operation.cl",
+#include "./cl_kernels/reduce_operation.clembed"
+    },
+    {
         "reduce_max.cl",
 #include "./cl_kernels/reduce_max.clembed"
     },
diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc
index c736a93..8f9cf58 100644
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -44,15 +44,13 @@
 #include <arm_compute/runtime/CL/functions/CLSoftmaxLayer.h>
 #include <arm_compute/runtime/CL/functions/CLGather.h>
 #include <arm_compute/runtime/CL/functions/CLTopKV2.h>
-#include <arm_compute/runtime/CL/functions/CLReduceMax.h>
-#include <arm_compute/runtime/CL/functions/CLReduceSum.h>
 #include <arm_compute/runtime/CL/functions/CLCast.h>
 #include <arm_compute/runtime/CL/functions/CLConvolutionLayer.h>
 #include <arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h>
 #include <arm_compute/runtime/CL/functions/CLDequantizationLayer.h>
 #include <arm_compute/runtime/CL/functions/CLDepthToSpace.h>
-#include <arm_compute/runtime/CL/functions/CLReductionMean.h>
 #include <arm_compute/runtime/CL/functions/CLPermuteEx.h>
+#include <arm_compute/runtime/CL/functions/CLReduceOperation.h>
 #include <arm_compute/runtime/CL/functions/CLRNNLayer.h>
 #include <arm_compute/runtime/CL/functions/CLFloor.h>
 #include <arm_compute/runtime/CL/functions/CLCopy.h>
@@ -2602,7 +2600,7 @@ void Planner::visit(const ::internal::tflite::op::ReduceMax::Node &node)
       ifm_index, asTensorInfo(asTensorShape(_ctx.at(ifm_index).shape()), _ctx.at(ifm_index).type(),
                               _ctx.at(ifm_index).scale(), _ctx.at(ifm_index).zeroPoint()));
 
-  std::vector<uint32_t> axis;
+  std::set<uint32_t> axis;
   {
     const auto ifm_rank = ifm_shape.rank();
     switch (axis_shape.rank())
@@ -2614,7 +2612,7 @@ void Planner::visit(const ::internal::tflite::op::ReduceMax::Node &node)
         {
           axis_value += ifm_rank;
         }
-        axis.push_back(ToARMComputeAxis(ifm_rank, axis_value).value());
+        axis.insert(ToARMComputeAxis(ifm_rank, axis_value).value());
         break;
       }
       case 1: // vector
@@ -2632,7 +2630,7 @@ void Planner::visit(const ::internal::tflite::op::ReduceMax::Node &node)
           {
             axis_value += ifm_rank;
           }
-          axis.push_back(ToARMComputeAxis(ifm_rank, axis_value).value());
+          axis.insert(ToARMComputeAxis(ifm_rank, axis_value).value());
         }
         break;
       }
@@ -2640,9 +2638,6 @@ void Planner::visit(const ::internal::tflite::op::ReduceMax::Node &node)
         throw std::runtime_error("Not supported");
         break;
     }
-    std::sort(axis.begin(), axis.end());
-    auto last = std::unique(axis.begin(), axis.end());
-    axis.erase(last, axis.end());
   }
 
   // Construct operation parameters
@@ -2650,7 +2645,7 @@ void Planner::visit(const ::internal::tflite::op::ReduceMax::Node &node)
   {
     int ofm_index;
     int ifm_index;
-    std::vector<uint32_t> axis;
+    std::set<uint32_t> axis;
   };
 
   Param param;
@@ -2665,9 +2660,10 @@ void Planner::visit(const ::internal::tflite::op::ReduceMax::Node &node)
 
     if (::internal::arm_compute::isGpuMode())
     {
-      auto fn = nnfw::make_unique<::arm_compute::CLReduceMax>();
+      auto fn = nnfw::make_unique<::arm_compute::CLReduceOperation>();
 
-      fn->configure(CAST_CL(ifm_alloc), param.axis, CAST_CL(ofm_alloc));
+      fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), param.axis,
+                    ::arm_compute::ReduceOperation::MAX);
 
       builder.append("ReduceMax", std::move(fn));
     }
@@ -3195,47 +3191,44 @@ void Planner::visit(const ::internal::tflite::op::Mean::Node &node)
                                        _ctx.at(axis_index).type(), _ctx.at(axis_index).scale(),
                                        _ctx.at(axis_index).zeroPoint()));
 
-  // TODO keep_dims==0
-  assert(keep_dims != 0);
-
-  // Set axis
-  // TODO Other axis (Axis for width and height are currently supported.)
-  // TODO Other ranks (Rank 4 is currently supported.)
-  assert(_ctx.at(ifm_index).shape().rank() == 4);
-
-  std::vector<uint32_t> axis;
+  std::set<uint32_t> axis;
   {
-    const auto axis_base = _ctx.at(axis_index).data().base();
-    const auto axis_type = _ctx.at(axis_index).type();
-    const auto axis_size = _ctx.at(axis_index).shape().asVector();
-
-    // If axis's data does not exist as constant values and can be gotten as input data, we have
-    // to find a way to infer output shape when sinking output.
-    assert(axis_base != nullptr);
-    // NHWC type -> WHCN type
-    if (_ctx.at(ofm_index).shape().rank() == 4)
+    const auto ifm_rank = ifm_shape.rank();
+    const auto axis_shape = _ctx.at(axis_index).shape();
+    switch (axis_shape.rank())
     {
-      for (uint32_t n = 0; n < axis_size; ++n)
+      case 0: // scalar
       {
-        const ::arm_compute::Coordinates coordinate{n};
-        const int32_t *from = reinterpret_cast<const int32_t *>(axis_base) + n;
-        if (*from == 1)
-        {
-          axis.push_back(1); // h
-        }
-        else if (*from == 2)
-        {
-          axis.push_back(0); // w
-        }
-        else if (*from < 0)
+        int32_t axis_value = _ctx.at(axis_index).asScalar<int32_t>();
+        if (axis_value < 0)
         {
-          // Nothing to do
+          axis_value += ifm_rank;
         }
-        else
+        axis.insert(ToARMComputeAxis(ifm_rank, axis_value).value());
+        break;
+      }
+      case 1: // vector
+      {
+        const auto axis_base = _ctx.at(axis_index).data().base();
+        const auto axis_size = _ctx.at(axis_index).shape().asVector();
+
+        // If axis's data does not exist as constant values and can be gotten as input data, we have
+        // to find a way to infer output shape when sinking output.
+        assert(axis_base != nullptr);
+        for (uint32_t n = 0; n < axis_size; ++n)
         {
-          throw std::runtime_error{"Not supported axis"};
+          int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n);
+          if (axis_value < 0)
+          {
+            axis_value += ifm_rank;
+          }
+          axis.insert(ToARMComputeAxis(ifm_rank, axis_value).value());
         }
+        break;
       }
+      default:
+        throw std::runtime_error("Not supported");
+        break;
     }
   }
 
@@ -3243,7 +3236,7 @@ void Planner::visit(const ::internal::tflite::op::Mean::Node &node)
   {
     int ofm_index;
     int ifm_index;
-    std::vector<uint32_t> axis;
+    std::set<uint32_t> axis;
   };
 
   Param param;
@@ -3258,9 +3251,10 @@ void Planner::visit(const ::internal::tflite::op::Mean::Node &node)
 
     if (::internal::arm_compute::isGpuMode())
     {
-      auto fn = nnfw::make_unique<::arm_compute::CLReductionMean>();
+      auto fn = nnfw::make_unique<::arm_compute::CLReduceOperation>();
 
-      fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), param.axis);
+      fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), param.axis,
+                    ::arm_compute::ReduceOperation::MEAN);
 
       builder.append("Mean", std::move(fn));
     }
@@ -5073,13 +5067,17 @@ void Planner::visit(const ::internal::tflite::op::ReduceSum::Node &node)
                               _ctx.at(ifm_index).scale(), _ctx.at(ifm_index).zeroPoint()));
 
   uint32_t input_rank = ifm_shape.rank();
-  std::vector<uint32_t> axis;
+  std::set<uint32_t> axis;
   int32_t axis_rank = axis_shape.rank();
 
   if (axis_rank == 0)
   {
     int32_t axis_value = _ctx.at(axis_index).asScalar<int32_t>();
-    axis.push_back(ToARMComputeAxis(input_rank, axis_value).value());
+    if (axis_value < 0)
+    {
+      axis_value += input_rank;
+    }
+    axis.insert(ToARMComputeAxis(input_rank, axis_value).value());
   }
   else if (axis_rank == 1)
   {
@@ -5091,8 +5089,12 @@ void Planner::visit(const ::internal::tflite::op::ReduceSum::Node &node)
     assert(axis_base != nullptr);
     for (uint32_t n = 0; n < axis_size; ++n)
     {
-      const int32_t *from = reinterpret_cast<const int32_t *>(axis_base) + n;
-      axis.push_back(ToARMComputeAxis(input_rank, *from).value());
+      int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n);
+      if (axis_value < 0)
+      {
+        axis_value += input_rank;
+      }
+      axis.insert(ToARMComputeAxis(input_rank, axis_value).value());
     }
   }
   else
@@ -5104,7 +5106,7 @@ void Planner::visit(const ::internal::tflite::op::ReduceSum::Node &node)
   {
     int ofm_index;
     int ifm_index;
-    std::vector<uint32_t> axis;
+    std::set<uint32_t> axis;
   };
 
   Param param;
@@ -5119,9 +5121,10 @@ void Planner::visit(const ::internal::tflite::op::ReduceSum::Node &node)
 
     if (::internal::arm_compute::isGpuMode())
     {
-      auto fn = nnfw::make_unique<::arm_compute::CLReduceSum>();
+      auto fn = nnfw::make_unique<::arm_compute::CLReduceOperation>();
 
-      fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), param.axis);
+      fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), param.axis,
+                    ::arm_compute::ReduceOperation::SUM);
 
       builder.append("ReduceSum", std::move(fn));
     }
-- 
2.7.4