CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default;
/**
* @brief Initialise the kernel's input and output.
- * @param[in] input An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[out] output The output tensor. Data types supported: same as @p input.
* @param[in] reduce_axis Axis to reduce
- * @param[out] output The output tensor, Data types supported: same as @p input1. Note:
- * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
* @return N/A
*/
- void configure(const ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output);
+ void configure(const ICLTensor *input, ICLTensor *output, const uint32_t reduce_axis);
/**
* @brief Static function to check if given info will lead to a valid configuration of @ref
* CLReduceMaxKernel
- * @param[in] input An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
- * @param[in] reduce_axis Axis to reduce
+ * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32.
* @param[in] output The output tensor info. Data types supported: same as @p input.
+ * @param[in] reduce_axis Axis to reduce
* @return a status
*/
- static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
- const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const uint32_t reduce_axis);
/**
* @brief Run CLReduceMaxKernel op on a given window and command queue
* @param[in] window Region on which to execute the kernel
* @param[in] queue Command queue on which to enqueue the kernel
*/
void run(const Window &window, cl::CommandQueue &queue) override;
private:
const ICLTensor *_input;
ICLTensor *_output;
- std::vector<uint32_t> _reduce_axis;
+ uint32_t _reduce_axis;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */
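// A minimal usage sketch of the kernel class above (a sketch only; `input` and
// `output` are assumed to be configured, allocated CLTensor objects, and the
// function name is illustrative, not part of this patch).
void reduce_max_kernel_example(CLTensor &input, CLTensor &output)
{
  CLReduceMaxKernel kernel;
  // validate() returns a Status, so the configuration can be checked without throwing.
  ARM_COMPUTE_ERROR_THROW_ON(
      CLReduceMaxKernel::validate(input.info(), output.info(), /*reduce_axis=*/1));
  kernel.configure(&input, &output, /*reduce_axis=*/1);
  CLScheduler::get().enqueue(kernel); // run on the default CL queue
}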
#ifndef __ARM_COMPUTE_CLREDUCE_MAX_H__
#define __ARM_COMPUTE_CLREDUCE_MAX_H__
+#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
/**
* @brief Initialise the kernel's inputs and outputs.
- * @param[in] input Input tensor
- * @param[in] axis Axis to reduce
- * @param[out] output The result of ReduceMax operation
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] reduce_axis Axis to reduce. It must be sorted and contain no duplicates.
+ * @param[out] output The result of ReduceMax operation. Data types supported: same as @p
+ * input.
* @return N/A
*/
void configure(ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output);
/**
* @brief Static function to check if given info will lead to a valid configuration
- * @param[in] input Input tensor
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
* @param[in] reduce_axis Axis to reduce. It must be sorted and contain no duplicates.
- * @param[out] output The result of ReduceMax operation
+ * @param[in] output The result of ReduceMax operation. Data types supported: same as @p
+ * input.
* @return a status
*/
static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
const ITensorInfo *output);
ICLTensor *_output;
std::vector<uint32_t> _reduce_axis;
- std::unique_ptr<ICLKernel> _kernel;
+ std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+ std::unique_ptr<CLReduceMaxKernel[]> _reduction_kernels{nullptr};
+ size_t _num_of_kernels;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLREDUCE_MAX_H__ */
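// A hedged usage sketch of the runtime function (shapes and names are
// illustrative); it assumes the tensors are already allocated and the CL
// scheduler has been initialised.
void reduce_max_example(ICLTensor &input, ICLTensor &output)
{
  CLReduceMax reduce_max;
  std::vector<uint32_t> axis{1, 2}; // must be sorted and duplicate-free
  // For a [4, 5, 2] input, the configured output shape is [4, 1, 1].
  reduce_max.configure(&input, axis, &output);
  reduce_max.run(); // enqueues one CLReduceMaxKernel per reduced axis
}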
*/
#include "helpers.h"
-#if defined(WIDTH)
+#if defined(DATA_TYPE) && defined(DEPTH_OUT)
/** Perform reduce max
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
*
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[out] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[out] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[out] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] stops Index of the last input element to include for each axis
+ * If value is -1, that means the axis is not reduced
*/
-__kernel void reduce_max(VECTOR_DECLARATION(input),
- VECTOR_DECLARATION(output))
+__kernel void reduce_max(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ const int4 stops)
{
- Vector input = CONVERT_TO_VECTOR_STRUCT(input);
- Vector output = CONVERT_TO_VECTOR_STRUCT(output);
+ // TODO Replace the nested for loops with a more efficient reduction
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
- __global float *input_addr = (__global float *)(input.ptr);
- __global float *output_addr = (__global float *)(output.ptr);
- float max_value = *input_addr;
+ DATA_TYPE max_value = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, get_global_id(2) / DEPTH_OUT));
- for(int x = 1; x < WIDTH; x++)
+ for(int x = (stops.x == -1 ? get_global_id(0) : 0); x <= (stops.x == -1 ? get_global_id(0) : stops.x); ++x)
{
- float value = *(input_addr + x);
- max_value = max(value, max_value);
+ for(int y = (stops.y == -1 ? get_global_id(1) : 0); y <= (stops.y == -1 ? get_global_id(1) : stops.y); ++y)
+ {
+ for(int z = (stops.z == -1 ? get_global_id(2) % DEPTH_OUT : 0); z <= (stops.z == -1 ? get_global_id(2) % DEPTH_OUT : stops.z); ++z)
+ {
+ for(int w = (stops.w == -1 ? get_global_id(2) / DEPTH_OUT : 0); w <= (stops.w == -1 ? get_global_id(2) / DEPTH_OUT : stops.w); ++w)
+ {
+ max_value = max(max_value, *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, z, w)));
+ }
+ }
+ }
}
- // Store max
- *output_addr = max_value;
+ *((__global DATA_TYPE *)out.ptr) = max_value;
}
-#endif // defined(WIDTH)
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT)
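// Host-side view of the stops argument (hypothetical shapes): for a
// [W=4, H=5, C=2] input reduced over axis 1, the reduce_max kernel above
// receives
//   cl_int4 stops = {{-1, 4, -1, -1}}; // 4 == input_shape[1] - 1
// Axes left at -1 stay pinned to the work-item's own coordinate (not reduced),
// while axis 1 iterates y = 0 .. 4 over every input row.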
namespace
{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t reduce_axis)
+{
+ TensorShape out_shape{input_shape};
+
+ out_shape.set(reduce_axis, 1);
+
+ return out_shape;
+}
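+ // A worked example (shapes hypothetical): inferOutputShape([4, 5, 2], 1)
+ // returns [4, 1, 2]; only the reduced dimension collapses to 1, which the
+ // 4D kernel window below relies on.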
+} // namespace
-Status validate_arguments(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
- const ITensorInfo *output)
+namespace
{
- // We can handle for simple case only
- // Input rank: 2
- // Output rank: 1
- // Axis: one axis value, restrict to 1
+constexpr unsigned int num_elems_processed_per_iteration = 16;
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const uint32_t reduce_axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32,
+ DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
"Output tensor shape must not be empty");
- // Validate in case of configured output
- if (output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(),
- "Output same type allowed for input and output");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1,
- "Only support for output dimension 1");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2,
- "Only support for input dimension 2");
- }
+ const TensorShape output_shape = inferOutputShape(input->tensor_shape(), reduce_axis);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+ "output shape's size does not match reduce_axis");
const auto num_dimensions = input->tensor_shape().num_dimensions();
- for (size_t i = 0; i < reduce_axis.size(); ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- reduce_axis[i] >= 0 && reduce_axis[i] < num_dimensions,
- "reduce_axis must be greater than or equal to 0 and less than (input's rank).");
- }
-
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(reduce_axis >= num_dimensions,
+ "reduce_axis must be less than the input's rank.");
return Status{};
}
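// A hypothetical failure case for the checks above: a rank-3 input with
// reduce_axis == 3 trips the rank check, and an output whose total size
// differs from the inferred shape's total size trips the shape check; both
// come back as an error Status rather than a hard assertion.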
CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _reduce_axis() {}
-void CLReduceMaxKernel::configure(const ICLTensor *input, std::vector<uint32_t> reduce_axis,
- ICLTensor *output)
+void CLReduceMaxKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const uint32_t reduce_axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), reduce_axis, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), reduce_axis));
_input = input;
_output = output;
_reduce_axis = reduce_axis;
- // Configure kernel window
- int cols = _input->info()->tensor_shape()[0];
- int rows = _input->info()->tensor_shape()[1];
- Window win;
- win.set(0, Window::Dimension(0, cols, 1));
- win.set(1, Window::Dimension(0, rows, 1));
+ std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+ output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), reduce_axis));
// Construct kernel name
std::string kernel_name = "reduce_max";
// Set kernel build options
std::set<std::string> build_opts;
- build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols));
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
// Create kernel
_kernel =
static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+ // Configure kernel window
+ Window win = calculate_max_window(*output_info, Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output_info->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
ICLKernel::configure(win);
}
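// Concretely (hypothetical values): an F32 input of shape [4, 5, 2] reduced
// over axis 1 gives an inferred output shape of [4, 1, 2], so the kernel is
// built with "-DDATA_TYPE=float -DDEPTH_OUT=2" (DEPTH_OUT == dimension(2)).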
-Status CLReduceMaxKernel::validate(const ITensorInfo *input,
- const std::vector<uint32_t> &reduce_axis,
- const ITensorInfo *output)
+Status CLReduceMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const uint32_t reduce_axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, reduce_axis, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, reduce_axis));
return Status{};
}
void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window window_input = window;
- Window slice_input = window_input.first_slice_window_1D();
+ const TensorShape &shape_in = _input->info()->tensor_shape();
+
+ unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+ // Initialize to -1, which marks an axis as not reduced
+ cl_int4 stops = {{
+ static_cast<cl_int>(-1), static_cast<cl_int>(-1), static_cast<cl_int>(-1),
+ static_cast<cl_int>(-1),
+ }};
+
+ stops.s[_reduce_axis] = static_cast<cl_int>(shape_in[_reduce_axis] - 1);
+
+ _kernel.setArg<cl_int4>(idx++, stops);
+
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ // Save the output's shape so it can be restored at the end of this method
+ const TensorShape shape_out = _output->info()->tensor_shape();
+ _output->info()->set_tensor_shape(inferOutputShape(shape_in, _reduce_axis));
do
{
- Window slice_output = slice_input.shift_dimensions(1);
unsigned int idx = 0;
- add_1D_tensor_argument(idx, _input, slice_input);
- add_1D_tensor_argument(idx, _output, slice_output);
- enqueue(queue, *this, slice_input);
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
- } while (window_input.slide_window_slice_1D(slice_input));
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+ // Restore the output tensor's original shape
+ _output->info()->set_tensor_shape(shape_out);
}
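// A hypothetical trace of the shape juggling above: if the caller allocated
// the output as [4, 2] (rank already collapsed), run() temporarily treats it
// as the rank-preserving [4, 1, 2] so the 4D slicing lines up with the input,
// then restores [4, 2] before returning.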
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
-
-#define REDUCE_MAX_RUN_ON_CPU 1
+#include "arm_compute/core/TensorInfo.h"
namespace arm_compute
{
-CLReduceMax::CLReduceMax() : _input(nullptr), _output(nullptr), _reduce_axis(), _kernel(nullptr) {}
+CLReduceMax::CLReduceMax()
+ : _input(nullptr), _output(nullptr), _reduce_axis(), _interm_tensors(), _reduction_kernels(),
+ _num_of_kernels()
+{
+}
void CLReduceMax::configure(ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output)
{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), reduce_axis, output->info()));
+
_reduce_axis = reduce_axis;
_input = input;
_output = output;
- auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>();
- k->configure(input, reduce_axis, output);
- _kernel = std::move(k);
+ // NOTE reduce_axis must be non-empty, sorted, and free of duplicates.
+ _num_of_kernels = reduce_axis.size();
+ const size_t num_of_interm_tensors = _num_of_kernels - 1;
+
+ _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _reduction_kernels =
+ arm_compute::support::cpp14::make_unique<CLReduceMaxKernel[]>(_num_of_kernels);
+
+ TensorShape shape{input->info()->tensor_shape()};
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ shape.set(reduce_axis[i], 1);
+ _interm_tensors[i].allocator()->init(
+ TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+ _interm_tensors[i].allocator()->allocate();
+ }
+
+ // Build a vector of tensors in execution order: input, intermediates, output.
+ std::vector<ICLTensor *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ tensors.emplace_back(_interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
- // We can handle for simple case only
- // Output rank: 1
- // Axis: one axis value, restrict to 1
- ARM_COMPUTE_ERROR_THROW_ON(
- CLReduceMaxKernel::validate(input->info(), reduce_axis, output->info()));
+ // Configure one ReduceMax kernel per reduction axis
+ for (size_t i = 0; i < _num_of_kernels; i++)
+ {
+ _reduction_kernels[i].configure(tensors[i], tensors[i + 1], reduce_axis[i]);
+ }
}
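// Sketch of the kernel chain configure() builds for reduce_axis = {1, 2} on a
// hypothetical [4, 5, 2] input:
//   input [4, 5, 2] --kernel 0 (axis 1)--> interm [4, 1, 2]
//   interm [4, 1, 2] --kernel 1 (axis 2)--> output [4, 1, 1]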
Status CLReduceMax::validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
const ITensorInfo *output)
{
- return CLReduceMaxKernel::validate(input, reduce_axis, output);
+ const size_t num_of_kernels = reduce_axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+ // Create temporary tensor infos
+ auto interm_tensors =
+ arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ shape.set(reduce_axis[i], 1);
+ interm_tensors[i].set_data_type(input->data_type());
+ interm_tensors[i].set_tensor_shape(shape);
+ interm_tensors[i].set_num_channels(input->num_channels());
+ }
+
+ // Build a vector of tensor infos in execution order: input, intermediates, output.
+ std::vector<const ITensorInfo *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ tensors.emplace_back(interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Validate each ReduceMax kernel in the chain
+ for (size_t i = 0; i < num_of_kernels; i++)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReduceMaxKernel::validate(tensors[i], tensors[i + 1], reduce_axis[i]));
+ }
+
+ return Status{};
}
void CLReduceMax::run()
arm_compute::CLScheduler::get().sync();
#else
- arm_compute::CLScheduler::get().enqueue(*_kernel);
+ for (size_t i = 0; i < _num_of_kernels; ++i)
+ {
+ CLScheduler::get().enqueue(_reduction_kernels[i]);
+ }
#endif
}
throw std::runtime_error("Not supported");
break;
}
+ std::sort(axis.begin(), axis.end());
+ auto last = std::unique(axis.begin(), axis.end());
+ axis.erase(last, axis.end());
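+ // E.g. a requested axis list {2, 1, 2} becomes {1, 2}: sorted and
+ // duplicate-free, as CLReduceMax::configure requires (std::sort and
+ // std::unique need <algorithm>).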
}
// Construct operation parameters