This commit adds support for the ReduceMax operation on tensors of up to 4 dimensions.
Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
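For illustration, a minimal usage sketch of the extended interface (a sketch only: tensor allocation, CL scheduler setup, and the exact header path are assumed and not shown; `input` and `output` are hypothetical, already-allocated CLTensor objects):

    // Reduce a 4-D input over its two innermost ACL axes (W and H), keeping D and B.
    arm_compute::CLReduceMax reduce_max;
    std::vector<uint32_t> reduce_axis{0, 1};
    reduce_max.configure(&input, reduce_axis, &output); // output dims 0 and 1 must have size 1
    reduce_max.run();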
CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default;
/** Initialise the kernel's input, output and border mode.
*
- * @param[in] input An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
- * @param[in] axis Axis to reduce
- * @param[out] output The output tensor, Data types supported: same as @p input1. Note:
- * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * @param[in] input An input tensor. Data types supported: QASYMM8/S32/F32.
+ * @param[in] reduce_axis Axes to reduce
+ * @param[out] output The output tensor, Data types supported: same as @p input.
*/
- void configure(const ICLTensor *input, int32_t axis, ICLTensor *output);
+ void configure(const ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref
* CLReduceMaxKernel
*
- * @param[in] input An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
- * @param[in] axis Axis to reduce
- * @param[in] output The output tensor info, Data types supported: same as @p input1.
- * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * @param[in] input An input tensor info. Data types supported: QASYMM8/S32/F32.
+ * @param[in] reduce_axis Axes to reduce
+ * @param[in] output The output tensor info, Data types supported: same as @p input.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
const ICLTensor *_input;
ICLTensor *_output;
- int32_t _axis;
+ std::vector<uint32_t> _reduce_axis;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */
* @note When locations of min and max occurrences are requested, the reported number of locations
* is limited to the given array size.
*
- * @param[in] input Input image. Data types supported: F32
- * @param[in] axis Axis to reduce. Data type supported: S32
- * @param[out] output indices related to top k values. Data types supported: F32.
+ * @param[in] input Input tensor. Data types supported: QASYMM8/S32/F32
+ * @param[in] reduce_axis Axes to reduce.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
*/
- void configure(ICLTensor *input, int32_t axis, ICLTensor *output);
+ void configure(ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref
* CLPixelWiseDivision
*
- * @param[in] input Input image. Data types supported: F32
- * @param[in] axis Axis to reduce. Data type supported: S32
- * @param[out] output indices related to top k values. Data types supported: F32. *
+ * @param[in] input Input tensor info. Data types supported: QASYMM8/S32/F32
+ * @param[in] reduce_axis Axes to reduce.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run() override;
private:
void run_on_cpu();
- int32_t _axis;
-
ICLTensor *_input;
ICLTensor *_output;
+ std::vector<uint32_t> _reduce_axis;
std::unique_ptr<ICLKernel> _kernel;
};
{
constexpr unsigned int num_elems_processed_per_iteration = 16;
-Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
+ const ITensorInfo *output)
{
// We can handle for simple case only
// Input rank: 2
// Axis: one axis value, restrict to 1
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis only allowed 1");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
"Inputs are not broadcast compatible");
"Only support for input dimension 2");
}
+ const auto num_dimensions = input->tensor_shape().num_dimensions();
+ for (size_t i = 0; i < reduce_axis.size(); ++i)
+ {
+    // ARM_COMPUTE_RETURN_ERROR_ON_MSG reports an error when its condition is TRUE,
+    // so flag axes that are out of range (reduce_axis is unsigned, so it cannot be negative).
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(reduce_axis[i] >= num_dimensions,
+                                    "reduce_axis must be less than the input's rank.");
+ }
+
return Status{};
}
} // namespace
-CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _axis(0) {}
+CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _reduce_axis() {}
-void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTensor *output)
+void CLReduceMaxKernel::configure(const ICLTensor *input, std::vector<uint32_t> reduce_axis,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), reduce_axis, output->info()));
_input = input;
_output = output;
- _axis = axis;
+ _reduce_axis = reduce_axis;
// Configure kernel window
int cols = _input->info()->tensor_shape()[0];
ICLKernel::configure(win);
}
-Status CLReduceMaxKernel::validate(const ITensorInfo *input, int32_t axis,
+Status CLReduceMaxKernel::validate(const ITensorInfo *input,
+ const std::vector<uint32_t> &reduce_axis,
const ITensorInfo *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, reduce_axis, output));
return Status{};
}
namespace arm_compute
{
-CLReduceMax::CLReduceMax() : _axis(0), _input(nullptr), _output(nullptr), _kernel(nullptr) {}
+CLReduceMax::CLReduceMax() : _input(nullptr), _output(nullptr), _reduce_axis(), _kernel(nullptr) {}
-void CLReduceMax::configure(ICLTensor *input, int axis, ICLTensor *output)
+void CLReduceMax::configure(ICLTensor *input, std::vector<uint32_t> reduce_axis, ICLTensor *output)
{
- _axis = axis;
+ _reduce_axis = reduce_axis;
_input = input;
_output = output;
auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>();
- k->configure(input, axis, output);
+ k->configure(input, reduce_axis, output);
_kernel = std::move(k);
- // We can handle for simple case only
- // Output rank: 1
- // Axis: one axis value, restrict to 1
- ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().num_dimensions() != 2);
- ARM_COMPUTE_ERROR_ON(output->info()->tensor_shape().num_dimensions() != 1);
- ARM_COMPUTE_ERROR_ON(axis != 1);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CLReduceMaxKernel::validate(input->info(), reduce_axis, output->info()));
}
-Status CLReduceMax::validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
+Status CLReduceMax::validate(const ITensorInfo *input, const std::vector<uint32_t> &reduce_axis,
+ const ITensorInfo *output)
{
- return CLReduceMaxKernel::validate(input, axis, output);
+ return CLReduceMaxKernel::validate(input, reduce_axis, output);
}
void CLReduceMax::run()
#endif
}
-void CLReduceMax::run_on_cpu()
+inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
{
- cl::CommandQueue q = CLScheduler::get().queue();
+ int32_t offset = b * shape[2] * shape[1] * shape[0];
+ offset += d * shape[1] * shape[0];
+ offset += h * shape[0];
+ offset += w;
+ return offset;
+}
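As a quick sanity check of the offset computation above (a worked example only; shape indices follow ACL order, with shape[0] innermost):

    // shape = [W=5, H=4, D=3, B=2]
    // offset4D(shape, b=1, d=2, h=3, w=4)
    //   = 1*(3*4*5) + 2*(4*5) + 3*5 + 4
    //   = 60 + 40 + 15 + 4 = 119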
- _input->map(q);
- _output->map(q);
+inline const TensorShape inferOutputShape(const TensorShape &input_shape,
+ const std::vector<uint32_t> &reduce_axis)
+{
+ TensorShape out_shape{};
- // Compute by CPU for simple case
- // Input rank: 2
- // Output rank: 1
- // Axis: one axis value, restrict to 1
+ bool keep_axis[4] = {true, true, true, true};
- float *input_data = (float *)_input->buffer();
- float *output_data = (float *)_output->buffer();
+  for (size_t i = 0; i < reduce_axis.size(); ++i)
+ {
+ auto axis = reduce_axis[i];
+ keep_axis[axis] = false;
+ }
+
+  for (size_t i = 0; i < input_shape.num_dimensions(); ++i)
+ {
+ size_t dim = 1;
+ if (keep_axis[i])
+ {
+ dim = input_shape[i];
+ }
+ out_shape.set(i, dim);
+ }
- std::vector<float> container_max;
- int cols = _input->info()->tensor_shape()[0];
- int rows = _input->info()->tensor_shape()[1];
- container_max.resize(rows);
+ return out_shape;
+}
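For reference, a small worked example of the shape inference above (a sketch only; axis indices are in ACL order, where axis 0 is the innermost dimension):

    TensorShape in_shape(8U, 6U, 4U, 2U);                       // [W=8, H=6, D=4, B=2]
    TensorShape out_shape = inferOutputShape(in_shape, {0, 1});
    // keep_axis becomes {false, false, true, true}, so out_shape is [1, 1, 4, 2].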
- // Initialize as 1st element in row
- float *input_pointer = input_data;
- for (int i = 0; i < rows; i++)
+template <typename T>
+inline T getReduceMax(const T *input_data, const TensorShape &input_shape,
+ const TensorShape &output_shape, const size_t b, const size_t d,
+ const size_t h, const size_t w)
+{
+ T max_value = input_data[offset4D(input_shape, b, d, h, w)];
+
+  // If output[dimension] == 1, scan every value along that dimension because it is being reduced.
+  // Otherwise, only the single value at the given index is checked.
+ const size_t start_b = output_shape[3] == 1 ? 0 : b;
+ const size_t start_d = output_shape[2] == 1 ? 0 : d;
+ const size_t start_h = output_shape[1] == 1 ? 0 : h;
+ const size_t start_w = output_shape[0] == 1 ? 0 : w;
+ const size_t stop_b = output_shape[3] == 1 ? input_shape[3] - 1 : b;
+ const size_t stop_d = output_shape[2] == 1 ? input_shape[2] - 1 : d;
+ const size_t stop_h = output_shape[1] == 1 ? input_shape[1] - 1 : h;
+ const size_t stop_w = output_shape[0] == 1 ? input_shape[0] - 1 : w;
+ for (size_t in_b = start_b; in_b <= stop_b; ++in_b)
{
- container_max[i] = *input_pointer;
- input_pointer += cols;
+ for (size_t in_d = start_d; in_d <= stop_d; ++in_d)
+ {
+ for (size_t in_h = start_h; in_h <= stop_h; ++in_h)
+ {
+ for (size_t in_w = start_w; in_w <= stop_w; ++in_w)
+ {
+ max_value =
+ std::max<T>(max_value, input_data[offset4D(input_shape, in_b, in_d, in_h, in_w)]);
+ }
+ }
+ }
}
- // Update max value in row
- for (int i = 0; i < rows; i++)
+ return max_value;
+}
+
+template <typename T>
+inline void reduceMax(const T *input_data, const TensorShape &input_shape,
+ const TensorShape &output_shape, T *output_data)
+{
+ for (size_t out_b = 0; out_b < output_shape[3]; ++out_b)
{
- float max_in_row = container_max[i];
- for (int j = 1; j < cols; j++)
+ for (size_t out_d = 0; out_d < output_shape[2]; ++out_d)
{
- if (max_in_row < input_data[i * cols + j])
+ for (size_t out_h = 0; out_h < output_shape[1]; ++out_h)
{
- max_in_row = input_data[i * cols + j];
+ for (size_t out_w = 0; out_w < output_shape[0]; ++out_w)
+ {
+ output_data[offset4D(output_shape, out_b, out_d, out_h, out_w)] =
+ getReduceMax(input_data, input_shape, output_shape, out_b, out_d, out_h, out_w);
+ }
}
}
- container_max[i] = max_in_row;
}
+}
+
+void CLReduceMax::run_on_cpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ _input->map(q);
+ _output->map(q);
+
+ TensorShape input_shape = _input->info()->tensor_shape();
+ TensorShape output_shape = inferOutputShape(input_shape, _reduce_axis);
+
+  // NOTE The number of dimensions of input_shape and output_shape can be less than 4.
+  // However, we treat both as 4-dimensional here in order to support inputs of up to rank 4.
+ input_shape.set_num_dimensions(4);
+ output_shape.set_num_dimensions(4);
- for (int i = 0; i < rows; i++)
+ switch (_input->info()->data_type())
{
- output_data[i] = container_max[i];
+ case DataType::QASYMM8:
+ reduceMax(reinterpret_cast<const uint8_t *>(_input->buffer()), input_shape, output_shape,
+ reinterpret_cast<uint8_t *>(_output->buffer()));
+ break;
+ case DataType::S32:
+ reduceMax(reinterpret_cast<const int32_t *>(_input->buffer()), input_shape, output_shape,
+ reinterpret_cast<int32_t *>(_output->buffer()));
+ break;
+ case DataType::F32:
+ reduceMax(reinterpret_cast<const float *>(_input->buffer()), input_shape, output_shape,
+ reinterpret_cast<float *>(_output->buffer()));
+ break;
+    default:
+ ARM_COMPUTE_ERROR("DataType not supported");
+ break;
}
_input->unmap(q);
const ::internal::tflite::operand::Index ifm_index{node.param().ifm_index};
const ::internal::tflite::operand::Index axis_index{node.param().axis_index};
- // Handle special case only:
- // Input: Matrix (rank 2)
- // Output: Vector (rank 1)
- // Axis: one element (scalar or rank 1 with 1 element), constant
auto ifm_shape = _ctx.at(ifm_index).shape();
auto ofm_shape = _ctx.at(ofm_index).shape();
auto axis_shape = _ctx.at(axis_index).shape();
- assert(ofm_shape.rank() == 1);
- assert(ifm_shape.rank() == 2);
+ assert(ifm_shape.rank() <= 4);
+ assert(ofm_shape.rank() <= ifm_shape.rank());
assert(_ctx.at(axis_index).hasData());
- assert(axis_shape.rank() == 0 || ((axis_shape.rank() == 1) && (axis_shape.dim(0) == 1)));
+ assert(axis_shape.rank() == 0 || axis_shape.rank() == 1);
+  if (ifm_shape.rank() != ofm_shape.rank())
+ {
+    // The input's rank is always greater than or equal to the output's rank,
+    // so the output's rank has to be extended to match the input's.
+ const_cast<::internal::tflite::operand::Shape &>(_ctx.at(ofm_index).shape())
+ .extendRank(ifm_shape.rank());
+ }
_builder.addShapeConstr(ofm_index, asTensorInfo(asTensorShape(_ctx.at(ofm_index).shape()),
_ctx.at(ofm_index).type()));
_builder.addShapeConstr(ifm_index, asTensorInfo(asTensorShape(_ctx.at(ifm_index).shape()),
_ctx.at(ifm_index).type()));
- // Note: Assume only one element in axis. It is checked by assertion above
- // TODO: handle general case
- // Axis is integer value (generally, int32)
- int32_t axis_value = _ctx.at(axis_index).asScalar<int32_t>();
- assert(axis_value == 1);
+ std::vector<uint32_t> axis;
+ {
+ const auto ifm_rank = ifm_shape.rank();
+ switch (axis_shape.rank())
+ {
+ case 0: // scalar
+ {
+ int32_t axis_value = _ctx.at(axis_index).asScalar<int32_t>();
+ if (axis_value < 0)
+ {
+ axis_value += ifm_rank;
+ }
+ axis.push_back(ToARMComputeAxis(ifm_rank, axis_value).value());
+ break;
+ }
+ case 1: // vector
+ {
+ const auto axis_base = _ctx.at(axis_index).data().base();
+ const auto axis_size = _ctx.at(axis_index).shape().asVector();
+
+ for (uint32_t n = 0; n < axis_size; ++n)
+ {
+ int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n);
+ if (axis_value < 0)
+ {
+ axis_value += ifm_rank;
+ }
+ axis.push_back(ToARMComputeAxis(ifm_rank, axis_value).value());
+ }
+ break;
+ }
+ default:
+ throw std::runtime_error("Not supported");
+ break;
+ }
+ }
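To illustrate the axis normalization above (a sketch only; it assumes ToARMComputeAxis maps NN axis i of a rank-r tensor to ACL axis r - 1 - i, matching its use elsewhere in this backend):

    // NN axis -1 on a rank-4 input refers to the last NN dimension.
    int32_t axis_value = -1;
    const uint32_t ifm_rank = 4;
    if (axis_value < 0)
      axis_value += ifm_rank;                 // axis_value becomes 3
    // ToARMComputeAxis(4, 3).value() would then be 0, i.e. the innermost (W) ACL axis.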
// Construct operation parameters
struct Param
{
int ofm_index;
int ifm_index;
-
- int32_t axis;
+ std::vector<uint32_t> axis;
};
Param param;
param.ofm_index = ofm_index.asInt();
param.ifm_index = ifm_index.asInt();
- param.axis = axis_value;
+ param.axis = axis;
auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
auto ofm_alloc = ctx.at(::internal::tflite::operand::Index{param.ofm_index});