From 2e1befe2394d4f36f5851c8f8f14f8a427773ef0 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Prasanna=20R/SNAP=20/SRI-Bangalore/Engineer/=EC=82=BC?= =?utf8?q?=EC=84=B1=EC=A0=84=EC=9E=90?= Date: Fri, 7 Dec 2018 12:36:03 +0530 Subject: [PATCH] Unify REDUCE_SUM and REDUCE_MAX operation in CL kernel (#3885) This patch unifies REDUCE_SUM and REDUCE_MAX operation in CL kernel This is done to reduce code redundancy. Related issue: #3771 Signed-off-by: prasannar --- libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp | 3 +- .../src/core/CL/cl_kernels/reduce_operation.cl | 68 +++------------------- .../core/CL/kernels/CLReduceOperationKernel.cpp | 4 +- 3 files changed, 12 insertions(+), 63 deletions(-) diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index c62a13b..85b80ca 100644 --- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -248,8 +248,7 @@ const std::map CLKernelLibraryEx::_kernel_program_map {"pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl"}, {"quantization_layer", "quantization_layer.cl"}, {"reduce_min_max", "reduce_operation.cl"}, - {"reduce_mean", "reduce_operation.cl"}, - {"reduce_sum", "reduce_operation.cl"}, + {"reduce_sum_mean", "reduce_operation.cl"}, {"remap_nearest_neighbour", "remap.cl"}, {"remap_bilinear", "remap.cl"}, {"reshape_layer", "reshape_layer.cl"}, diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl index f047b51..be3b75a 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl @@ -75,7 +75,7 @@ __kernel void reduce_min_max(TENSOR4D_DECLARATION(input), *((__global DATA_TYPE *)out.ptr) = value; } -/** Perform reduce mean +/** Perform reduce sum/mean * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 @@ -103,64 +103,10 @@ __kernel void reduce_min_max(TENSOR4D_DECLARATION(input), * @param[in] axis Axis through which reduction occurs * @param[in] dim Dimension across the axis to be reduced. */ -__kernel void reduce_mean(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int axis, - const int dim) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int indices[4] = - { - get_global_id(0), - get_global_id(1), - get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, - }; - - DATA_TYPE sum_value = (DATA_TYPE)0; - for(int i = 0; i < dim; ++i) - { - indices[axis] = i; - sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); - } - - *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE); -} - -/** Perform reduce sum - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] axis Axis through which reduction occurs - * @param[in] dim Dimension across the axis to be reduced. - */ -__kernel void reduce_sum(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int axis, - const int dim) +__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + const int axis, + const int dim) { Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); @@ -180,6 +126,10 @@ __kernel void reduce_sum(TENSOR4D_DECLARATION(input), sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); } +#if OP_CODE == 2 // REDUCE_SUM *((__global DATA_TYPE *)out.ptr) = sum_value; +#elif OP_CODE == 1 // REDUCE_MEAN + *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE); +#endif } #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp index def0562..e2da6c6 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -98,12 +98,12 @@ void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *outpu } else if (op == ReduceOperation::MEAN) { - kernel_name = "reduce_mean"; + kernel_name = "reduce_sum_mean"; op_code = 1; } else if (op == ReduceOperation::SUM) { - kernel_name = "reduce_sum"; + kernel_name = "reduce_sum_mean"; op_code = 2; } else if (op == ReduceOperation::MIN) -- 2.7.4