From: 장지섭/동작제어Lab(SR)/Engineer/삼성전자
Date: Mon, 5 Nov 2018 07:04:39 +0000 (+0900)
Subject: Support ReduceMax kernel for cl up to 4-dimensions (#3340)
X-Git-Tag: 0.3~476
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=4af4ed2b4622ec7c2a842b1510c362be8594ef45;p=platform%2Fcore%2Fml%2Fnnfw.git

Support ReduceMax kernel for cl up to 4-dimensions (#3340)

* Support ReduceMax kernel for cl up to 4-dimensions

This commit supports the ReduceMax kernel for cl up to 4 dimensions.

Signed-off-by: jiseob.jang

* Optimize ReduceMax Kernel for cl

This commit optimizes the ReduceMax kernel for cl.
  - Change from calling one all-at-once kernel to calling separate kernels
    multiple times, one per reduction axis.

Signed-off-by: jiseob.jang
---

diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
index 8f32ad5..c97202e 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
@@ -64,24 +64,23 @@ public:
   CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default;
   /**
    * @brief Initialise the kernel's input, output and border mode.
-   * @param[in]  input       An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+   * @param[in]  input       An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+   * @param[out] output      The output tensor. Data types supported: same as @p input.
    * @param[in]  reduce_axis Axis to reduce
-   * @param[out] output      The output tensor, Data types supported: same as @p input1. Note:
-   *                         U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
    * return N/A
    */
-  void configure(const ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output);
+  void configure(const ICLTensor *input, ICLTensor *output, const uint32_t reduce_axis);
   /**
    * @brief Static function to check if given info will lead to a valid configuration of @ref
    *        CLReduceMaxKernel
-   * @param[in] input       An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
-   * @param[in] reduce_axis Axis to reduce
+   * @param[in] input       An input tensor info. Data types supported: U8/QASYMM8/S32/F32.
    * @param[in] output      The output tensor info, Data types supported: same as @p input1.
+   * @param[in] reduce_axis Axis to reduce
    *                        Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
    * @return a status
    */
-  static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
-                         const ITensorInfo *output);
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                         const uint32_t reduce_axis);

   /*
    * @brief Run CLReduceMaxKernel op
@@ -100,7 +99,7 @@ public:
 private:
   const ICLTensor *_input;
   ICLTensor *_output;
-  std::vector<uint32_t> _reduce_axis;
+  uint32_t _reduce_axis;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
index 9f36fff..40de94e 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
@@ -24,7 +24,9 @@
 #ifndef __ARM_COMPUTE_CLREDUCE_MAX_H__
 #define __ARM_COMPUTE_CLREDUCE_MAX_H__

+#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
 #include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"

 namespace arm_compute
 {
@@ -66,18 +68,20 @@ public:
   /**
    * @brief Initialise the kernel's inputs and outputs.
-   * @param[in]  input  Input tensor
-   * @param[in]  axis   Axis to reduce
-   * @param[out] output The result of ReduceMax operation
+   * @param[in]  input  Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+   * @param[in]  axis   Axis to reduce. It must be sorted and contain no duplicates.
+   * @param[out] output The result of the ReduceMax operation. Data types supported: same as @p
+   *                    input.
    * @return N/A
    */
   void configure(ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output);
   /**
    * @brief Static function to check if given info will lead to a valid configuration
-   * @param[in]  input  Input tensor
+   * @param[in]  input  Input tensor. Data types supported: U8/QASYMM8/S32/F32.
    * @param[in]  axis   Axis to reduce
-   * @param[out] output The result of ReduceMax operation
+   * @param[out] output The result of the ReduceMax operation. Data types supported: same as @p
+   *                    input.
    * @return a status
    */
   static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
@@ -100,7 +104,9 @@ private:
   ICLTensor *_output;
   std::vector<uint32_t> _reduce_axis;

-  std::unique_ptr<ICLKernel> _kernel;
+  std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+  std::unique_ptr<CLReduceMaxKernel[]> _reduction_kernels{nullptr};
+  size_t _num_of_kernels;
 };
 }
 #endif /*__ARM_COMPUTE_CLREDUCE_MAX_H__ */
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
index 21fc304..ca7d315 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
@@ -16,37 +16,59 @@
  */
 #include "helpers.h"

-#if defined(WIDTH)
+#if defined(DATA_TYPE) && defined(DEPTH_OUT)
 /** Perform reduce max
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
  *
- * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[out] output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[out] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[out] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  stops                                Index of the last input element for each axis
+ *                                                  If the value is -1, that axis is reduced
  */
-__kernel void reduce_max(VECTOR_DECLARATION(input),
-                         VECTOR_DECLARATION(output))
+__kernel void reduce_max(TENSOR4D_DECLARATION(input),
+                         TENSOR4D_DECLARATION(output),
+                         const int4 stops)
 {
-    Vector input = CONVERT_TO_VECTOR_STRUCT(input);
-    Vector output = CONVERT_TO_VECTOR_STRUCT(output);
+    // TODO Do not use for loop
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);

-    __global float *input_addr = (__global float *)(input.ptr);
-    __global float *output_addr = (__global float *)(output.ptr);
+    DATA_TYPE max_value = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, get_global_id(2) / DEPTH_OUT));

-    float max_value = *input_addr;
-    for(int x = 1; x < WIDTH; x++)
+    for(int x = (stops.x == -1 ? get_global_id(0) : 0); x <= (stops.x == -1 ? get_global_id(0) : stops.x); ++x)
     {
-        float value = *(input_addr + x);
-        max_value = max(value, max_value);
+        for(int y = (stops.y == -1 ? get_global_id(1) : 0); y <= (stops.y == -1 ? get_global_id(1) : stops.y); ++y)
+        {
+            for(int z = (stops.z == -1 ? get_global_id(2) % DEPTH_OUT : 0); z <= (stops.z == -1 ? get_global_id(2) % DEPTH_OUT : stops.z); ++z)
+            {
+                for(int w = (stops.w == -1 ? get_global_id(2) / DEPTH_OUT : 0); w <= (stops.w == -1 ? get_global_id(2) / DEPTH_OUT : stops.w); ++w)
+                {
+                    max_value = max(max_value, *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, z, w)));
+                }
+            }
+        }
     }

-    // Store max
-    *output_addr = max_value;
+    *((__global DATA_TYPE *)out.ptr) = max_value;
 }
-#endif // defined(WIDTH)
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
index 4fa08e6..bae9475 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
@@ -24,41 +24,38 @@ using namespace arm_compute;

 namespace
 {
-constexpr unsigned int num_elems_processed_per_iteration = 16;
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t reduce_axis)
+{
+  TensorShape out_shape{input_shape};
+
+  out_shape.set(reduce_axis, 1);
+
+  return out_shape;
+}
+} // namespace

-Status validate_arguments(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
-                          const ITensorInfo *output)
+namespace
 {
-  // We can handle for simple case only
-  // Input rank: 2
-  // Output rank: 1
-  // Axis: one axis value, restrict to 1
+constexpr unsigned int num_elems_processed_per_iteration = 16;

-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const uint32_t reduce_axis)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32,
+                                                       DataType::U8, DataType::QASYMM8);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);

   ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
                                   "Inputs are not broadcast compatible");

-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(),
-                                    "Output same type allowed for input and output");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1,
-                                    "Only support for output dimension 1");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2,
-                                    "Only support for input dimension 2");
-  }
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), reduce_axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output shape's size does not match reduce_axis");

   const auto num_dimensions = input->tensor_shape().num_dimensions();
-  for (size_t i = 0; i < reduce_axis.size(); ++i)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        reduce_axis[i] >= 0 && reduce_axis[i] < num_dimensions,
-        "reduce_axis must be greater than or equal to 0 and less than (input's rank).");
-  }
-
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      reduce_axis >= num_dimensions,
+      "reduce_axis must be greater than or equal to 0 and less than (input's rank).");
   return Status{};
 }

@@ -66,43 +63,46 @@ Status validate_arguments(const ITensorInfo *input, const std::vector<uint32_t>

 CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _reduce_axis() {}

-void CLReduceMaxKernel::configure(const ICLTensor *input, std::vector<uint32_t> reduce_axis,
-                                  ICLTensor *output)
+void CLReduceMaxKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                  const uint32_t reduce_axis)
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), reduce_axis, output->info()));
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), reduce_axis));

   _input = input;
   _output = output;
   _reduce_axis = reduce_axis;

-  // Configure kernel window
-  int cols = _input->info()->tensor_shape()[0];
-  int rows = _input->info()->tensor_shape()[1];
-  Window win;
-  win.set(0, Window::Dimension(0, cols, 1));
-  win.set(1, Window::Dimension(0, rows, 1));
+  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), reduce_axis));

   // Construct kernel name
   std::string kernel_name = "reduce_max";

   // Set kernel build options
   std::set<std::string> build_opts;
-  build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols));
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));

   // Create kernel
   _kernel =
       static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));

+  // Configure kernel window
+  Window win = calculate_max_window(*output_info, Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
   ICLKernel::configure(win);
 }

-Status CLReduceMaxKernel::validate(const ITensorInfo *input,
-                                   const std::vector<uint32_t> &reduce_axis,
-                                   const ITensorInfo *output)
+Status CLReduceMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                   const uint32_t reduce_axis)
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, reduce_axis, output));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, reduce_axis));

   return Status{};
 }

@@ -112,16 +112,41 @@ void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue)
   ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
   ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);

-  Window window_input = window;
-  Window slice_input = window_input.first_slice_window_1D();
+  const TensorShape &shape_in = _input->info()->tensor_shape();
+
+  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+  // Initialize to -1, which means the axis is not reduced
+  cl_int4 stops = {{
+      static_cast<cl_int>(-1), static_cast<cl_int>(-1), static_cast<cl_int>(-1),
+      static_cast<cl_int>(-1),
+  }};
+
+  stops.s[_reduce_axis] = static_cast<cl_int>(shape_in[_reduce_axis] - 1);
+
+  _kernel.setArg<cl_int4>(idx++, stops);
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Copy the output's shape so that it can be restored at the end of this method
+  const TensorShape shape_out = _output->info()->tensor_shape();
+  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _reduce_axis));

   do
   {
-    Window slice_output = slice_input.shift_dimensions(1);
     unsigned int idx = 0;
-    add_1D_tensor_argument(idx, _input, slice_input);
-    add_1D_tensor_argument(idx, _output, slice_output);
-    enqueue(queue, *this, slice_input);
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-  } while (window_input.slide_window_slice_1D(slice_input));
+
+  // Restore the output tensor's original shape
+  _output->info()->set_tensor_shape(shape_out);
 }
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
index 1a6f5e9..cdaa5c9 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
@@ -18,36 +18,97 @@

 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
-
-#define REDUCE_MAX_RUN_ON_CPU 1
+#include "arm_compute/core/TensorInfo.h"

 namespace arm_compute
 {

-CLReduceMax::CLReduceMax() : _input(nullptr), _output(nullptr), _reduce_axis(), _kernel(nullptr) {}
+CLReduceMax::CLReduceMax()
+    : _input(nullptr), _output(nullptr), _reduce_axis(), _interm_tensors(), _reduction_kernels(),
+      _num_of_kernels()
+{
+}

 void CLReduceMax::configure(ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output)
 {
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), reduce_axis, output->info()));
+
   _reduce_axis = reduce_axis;
   _input = input;
   _output = output;

-  auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>();
-  k->configure(input, reduce_axis, output);
-  _kernel = std::move(k);
+  // NOTE The reduce_axis must contain no duplicates.
+  _num_of_kernels = reduce_axis.size();
+  const size_t num_of_interm_tensors = _num_of_kernels - 1;
+
+  _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+  _reduction_kernels =
+      arm_compute::support::cpp14::make_unique<CLReduceMaxKernel[]>(_num_of_kernels);
+
+  TensorShape shape{input->info()->tensor_shape()};
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    shape.set(reduce_axis[i], 1);
+    _interm_tensors[i].allocator()->init(
+        TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+    _interm_tensors[i].allocator()->allocate();
+  }
+
+  // Set up a vector of the ICLTensors in order: input, intermediate tensors, output.
+  std::vector<ICLTensor *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    tensors.emplace_back(_interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);

-  // We can handle for simple case only
-  // Output rank: 1
-  // Axis: one axis value, restrict to 1
-  ARM_COMPUTE_ERROR_THROW_ON(
-      CLReduceMaxKernel::validate(input->info(), reduce_axis, output->info()));
+  // Configure a ReduceMax kernel for each axis
+  for (size_t i = 0; i < _num_of_kernels; i++)
+  {
+    _reduction_kernels[i].configure(tensors[i], tensors[i + 1], reduce_axis[i]);
+  }
 }

 Status CLReduceMax::validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
                              const ITensorInfo *output)
 {
-  return CLReduceMaxKernel::validate(input, reduce_axis, output);
+  const size_t num_of_kernels = reduce_axis.size();
+  const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+  // Create temporary tensor infos
+  auto interm_tensors =
+      arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+  // Create intermediate tensor info
+  TensorShape shape{input->tensor_shape()};
+
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    shape.set(reduce_axis[i], 1);
+    interm_tensors[i].set_data_type(input->data_type());
+    interm_tensors[i].set_tensor_shape(shape);
+    interm_tensors[i].set_num_channels(input->num_channels());
+  }
+
+  // Set up a vector of the ITensorInfos in order: input, intermediate infos, output.
+  std::vector<const ITensorInfo *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    tensors.emplace_back(interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Validate each ReduceMax kernel
+  for (size_t i = 0; i < num_of_kernels; i++)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLReduceMaxKernel::validate(tensors[i], tensors[i + 1], reduce_axis[i]));
+  }
+
+  return Status{};
 }

 void CLReduceMax::run()
 {
@@ -57,7 +118,10 @@ void CLReduceMax::run()

   arm_compute::CLScheduler::get().sync();
 #else
-  arm_compute::CLScheduler::get().enqueue(*_kernel);
+  for (size_t i = 0; i < _num_of_kernels; ++i)
+  {
+    CLScheduler::get().enqueue(_reduction_kernels[i]);
+  }
 #endif
 }
diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc
index 5b627ab..afbd8e7 100644
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -2632,6 +2632,9 @@ void Planner::visit(const ::internal::tflite::op::ReduceMax::Node &node)
       throw std::runtime_error("Not supported");
       break;
   }
+  std::sort(axis.begin(), axis.end());
+  auto last = std::unique(axis.begin(), axis.end());
+  axis.erase(last, axis.end());
 }

 // Construct operation parameters
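
For illustration only (not part of the patch): a minimal sketch of how the reworked
CLReduceMax chains one CLReduceMaxKernel per axis through intermediate tensors. The
tensor shapes, the main() harness, and the use of CLScheduler::default_init() are
assumptions for the example, not code from this commit.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReduceMax.h"

    using namespace arm_compute;

    int main()
    {
      CLScheduler::get().default_init(); // set up the CL context and queue

      // Hypothetical 4-D reduction over axes 1 and 2: [2, 3, 4, 5] -> [2, 1, 1, 5]
      CLTensor input, output;
      input.allocator()->init(TensorInfo(TensorShape(2U, 3U, 4U, 5U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(2U, 1U, 1U, 5U), 1, DataType::F32));

      // configure() builds one CLReduceMaxKernel per axis and an intermediate
      // CLTensor between consecutive kernels:
      //   kernel 0: [2, 3, 4, 5] -> intermediate [2, 1, 4, 5]  (axis 1)
      //   kernel 1: [2, 1, 4, 5] -> output       [2, 1, 1, 5]  (axis 2)
      CLReduceMax reduce_max;
      reduce_max.configure(&input, {1, 2}, &output); // axes sorted, no duplicates

      input.allocator()->allocate();
      output.allocator()->allocate();
      // ... fill `input` ...

      reduce_max.run(); // enqueues the per-axis kernels in order
      CLScheduler::get().sync();
      return 0;
    }

Splitting the reduction this way is what the stops argument of the CL kernel encodes:
each launch reduces exactly one axis (the one whose stops entry is not -1), so every
work-item loops over a single dimension instead of the whole reduced volume.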