From 2e1befe2394d4f36f5851c8f8f14f8a427773ef0 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Prasanna=20R/SNAP=20/SRI-Bangalore/Engineer/=EC=82=BC?=
 =?utf8?q?=EC=84=B1=EC=A0=84=EC=9E=90?= <prasanna.r@samsung.com>
Date: Fri, 7 Dec 2018 12:36:03 +0530
Subject: [PATCH] Unify REDUCE_SUM and REDUCE_MAX operation in CL kernel
 (#3885)

This patch unifies REDUCE_SUM and REDUCE_MAX operation in CL kernel
This is done to reduce code redundancy.
Related issue: #3771

Signed-off-by: prasannar <prasanna.r@samsung.com>
---
 libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp  |  3 +-
 .../src/core/CL/cl_kernels/reduce_operation.cl     | 68 +++-------------------
 .../core/CL/kernels/CLReduceOperationKernel.cpp    |  4 +-
 3 files changed, 12 insertions(+), 63 deletions(-)

diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index c62a13b..85b80ca 100644
--- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -248,8 +248,7 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl"},
     {"quantization_layer", "quantization_layer.cl"},
     {"reduce_min_max", "reduce_operation.cl"},
-    {"reduce_mean", "reduce_operation.cl"},
-    {"reduce_sum", "reduce_operation.cl"},
+    {"reduce_sum_mean", "reduce_operation.cl"},
     {"remap_nearest_neighbour", "remap.cl"},
     {"remap_bilinear", "remap.cl"},
     {"reshape_layer", "reshape_layer.cl"},
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
index f047b51..be3b75a 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
@@ -75,7 +75,7 @@ __kernel void reduce_min_max(TENSOR4D_DECLARATION(input),
     *((__global DATA_TYPE *)out.ptr) = value;
 }
 
-/** Perform reduce mean
+/** Perform reduce sum/mean
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
@@ -103,64 +103,10 @@ __kernel void reduce_min_max(TENSOR4D_DECLARATION(input),
  * @param[in]  axis                                 Axis through which reduction occurs
  * @param[in]  dim                                  Dimension across the axis to be reduced.
  */
-__kernel void reduce_mean(TENSOR4D_DECLARATION(input),
-                          TENSOR4D_DECLARATION(output),
-                          const int axis,
-                          const int dim)
-{
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
-    int indices[4] =
-    {
-        get_global_id(0),
-        get_global_id(1),
-        get_global_id(2) % DEPTH_OUT,
-        get_global_id(2) / DEPTH_OUT,
-    };
-
-    DATA_TYPE sum_value = (DATA_TYPE)0;
-    for(int i = 0; i < dim; ++i)
-    {
-      indices[axis] = i;
-      sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
-    }
-
-    *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE);
-}
-
-/** Perform reduce sum
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
- * @param[in]  input_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  axis                                 Axis through which reduction occurs
- * @param[in]  dim                                  Dimension across the axis to be reduced.
- */
-__kernel void reduce_sum(TENSOR4D_DECLARATION(input),
-                         TENSOR4D_DECLARATION(output),
-                         const int axis,
-                         const int dim)
+__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input),
+                              TENSOR4D_DECLARATION(output),
+                              const int axis,
+                              const int dim)
 {
     Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
@@ -180,6 +126,10 @@ __kernel void reduce_sum(TENSOR4D_DECLARATION(input),
       sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
     }
 
+#if OP_CODE == 2 // REDUCE_SUM
     *((__global DATA_TYPE *)out.ptr) = sum_value;
+#elif OP_CODE == 1 // REDUCE_MEAN
+    *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE);
+#endif
 }
 #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
index def0562..e2da6c6 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -98,12 +98,12 @@ void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *outpu
   }
   else if (op == ReduceOperation::MEAN)
   {
-    kernel_name = "reduce_mean";
+    kernel_name = "reduce_sum_mean";
     op_code = 1;
   }
   else if (op == ReduceOperation::SUM)
   {
-    kernel_name = "reduce_sum";
+    kernel_name = "reduce_sum_mean";
     op_code = 2;
   }
   else if (op == ReduceOperation::MIN)
-- 
2.7.4