From eb935598902d7a11cb14a5e0edbfc9c4d4ff6f09 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Siva=20Sai=20Vaddipati/System=20SW=20/SRI-Bangalore/Enginee?=
 =?utf8?q?r/=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?=
Date: Mon, 3 Sep 2018 14:58:01 +0530
Subject: [PATCH] Enabling PAD operation in NNAPI (#2097)

This commit enables the PAD operation in the pure_arm_compute runtime.

Signed-off-by: Siva Sai
---
 runtimes/pure_arm_compute/src/compilation.cc       | 62 ++++++++++++++++++++++
 .../src/internal/layers/PadLayer.cc                | 62 ++++++++++++++++++++++
 .../src/internal/layers/PadLayer.h                 | 25 +++++++++
 .../pure_arm_compute/src/internal/op/NodeVisitor.h |  2 +
 runtimes/pure_arm_compute/src/internal/op/Pad.cc   | 47 ++++++++++++++++
 runtimes/pure_arm_compute/src/internal/op/Pad.h    | 53 ++++++++++++++++++
 runtimes/pure_arm_compute/src/model.cc             | 14 +++++
 7 files changed, 265 insertions(+)
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/PadLayer.h
 create mode 100644 runtimes/pure_arm_compute/src/internal/op/Pad.cc
 create mode 100644 runtimes/pure_arm_compute/src/internal/op/Pad.h
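Note for reviewers (not part of the diff): a minimal sketch of how a client
would exercise this path through the NNAPI C API that this runtime implements
(NeuralNetworks.h). Operand indices, shapes, and padding values below are
illustrative only:

  #include <NeuralNetworks.h>
  #include <cstdint>

  // Build a tiny model with a single PAD operation: a 1x2x2x1 FP32 input,
  // a [4, 2] int32 paddings tensor, and the padded 1x4x4x1 output.
  void build_pad_model(ANeuralNetworksModel *model)
  {
    uint32_t in_dims[4] = {1, 2, 2, 1};
    ANeuralNetworksOperandType input{ANEURALNETWORKS_TENSOR_FLOAT32, 4, in_dims, 0.0f, 0};
    ANeuralNetworksModel_addOperand(model, &input); // operand 0

    uint32_t pad_dims[2] = {4, 2};
    ANeuralNetworksOperandType paddings{ANEURALNETWORKS_TENSOR_INT32, 2, pad_dims, 0.0f, 0};
    ANeuralNetworksModel_addOperand(model, &paddings); // operand 1

    // Uniform padding of 1 on H and W -- the case this patch supports
    int32_t pad_values[8] = {0, 0, 1, 1, 1, 1, 0, 0};
    ANeuralNetworksModel_setOperandValue(model, 1, pad_values, sizeof(pad_values));

    uint32_t out_dims[4] = {1, 4, 4, 1};
    ANeuralNetworksOperandType output{ANEURALNETWORKS_TENSOR_FLOAT32, 4, out_dims, 0.0f, 0};
    ANeuralNetworksModel_addOperand(model, &output); // operand 2

    uint32_t inputs[2] = {0, 1};
    uint32_t outputs[1] = {2};
    ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_PAD, 2, inputs, 1, outputs);
  }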
diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc
index 6912698..1fdc23d 100644
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -50,6 +50,7 @@
 #include "internal/layers/SimpleArithmeticAddition.h"
 #include "internal/layers/SimpleCastLayer.h"
 #include "internal/layers/GenericFullyConnectedLayer.h"
+#include "internal/layers/PadLayer.h"
 
 #include "util/matrix/IndexIterator.h"
 #include "util/kernel/IndexIterator.h"
@@ -475,6 +476,7 @@ public:
   void visit(const ::internal::tflite::op::Floor::Node &node) override;
   void visit(const ::internal::tflite::op::Split::Node &node) override;
   void visit(const ::internal::tflite::op::RSQRT::Node &node) override;
+  void visit(const ::internal::tflite::op::Pad::Node &node) override;
 
 private:
   const ::internal::tflite::operand::Set &_ctx;
@@ -3287,6 +3289,66 @@ void Planner::visit(const ::internal::tflite::op::Split::Node &node)
   // NOTE Split has no actual operation!
 }
 
+void Planner::visit(const ::internal::tflite::op::Pad::Node &node)
+{
+  const ::internal::tflite::operand::Index ofm_index{node.param().ofm_index};
+  const ::internal::tflite::operand::Index ifm_index{node.param().ifm_index};
+  const ::internal::tflite::operand::Index paddings_index{node.param().paddings_index};
+
+  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature();
+  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature();
+  const auto paddings_shape = _ctx.at(paddings_index).shape().asTensor();
+
+  // Set shape constraints and TensorInfo
+  _builder.addShapeConstr(
+      ifm_index, asTensorInfo(asTensorShape(_ctx.at(ifm_index).shape()), _ctx.at(ifm_index).type(),
+                              _ctx.at(ifm_index).scale(), _ctx.at(ifm_index).zeroPoint()));
+  _builder.addShapeConstr(
+      ofm_index, asTensorInfo(asTensorShape(_ctx.at(ofm_index).shape()), _ctx.at(ofm_index).type(),
+                              _ctx.at(ofm_index).scale(), _ctx.at(ofm_index).zeroPoint()));
+  _builder.addShapeConstr(
+      paddings_index,
+      asTensorInfo(asTensorShape(_ctx.at(paddings_index).shape()), _ctx.at(paddings_index).type(),
+                   _ctx.at(paddings_index).scale(), _ctx.at(paddings_index).zeroPoint()));
+
+  // Construct operation parameters
+  struct Param
+  {
+    int ofm_index;
+    int ifm_index;
+    int32_t padding_size;
+  };
+
+  Param param;
+
+  param.ofm_index = ofm_index.asInt();
+  param.ifm_index = ifm_index.asInt();
+
+  assert(_ctx.at(paddings_index).hasData() == true);
+
+  // TODO Currently only uniform padding of the tensor is supported
+  //      (TOP = BOTTOM = LEFT = RIGHT), so only a single value is read.
+  //      Read the padding values for all four sides (TOP, BOTTOM, LEFT &
+  //      RIGHT) once non-uniform padding is supported.
+  const auto &padding_data = _ctx.at(paddings_index).data();
+  auto base = padding_data.base();
+  auto padsize = reinterpret_cast<const int32_t *>(base) + 3;
+  param.padding_size = *padsize;
+
+  auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
+    auto ofm_alloc = ctx.at(::internal::tflite::operand::Index{param.ofm_index});
+    auto ifm_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index});
+
+    auto fn = nnfw::make_unique<PadLayer>();
+
+    fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), param.padding_size);
+    builder.append("Pad", std::move(fn));
+  };
+
+  _builder.addStage(stage);
+}
+
 class AllocationContext final : public IAllocationContext
 {
 public:
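Note on the offset used above (a sketch, not part of the diff): for a 4-D
NHWC input the NNAPI PAD paddings operand has shape [4, 2], stored row-major
as {N_before, N_after, top, bottom, left, right, C_before, C_after}, so
reading flattened offset 3 picks up the H-after (bottom) entry; any of
offsets 2..5 would do while padding is uniform. Reading all four sides, once
supported, could look like:

  const auto base = reinterpret_cast<const int32_t *>(padding_data.base());
  // Flattened [4, 2] layout for NHWC:
  // { N_before, N_after, top, bottom, left, right, C_before, C_after }
  const int32_t pad_top = base[2];
  const int32_t pad_bottom = base[3];
  const int32_t pad_left = base[4];
  const int32_t pad_right = base[5];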
diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc
new file mode 100644
index 0000000..857e077
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc
@@ -0,0 +1,62 @@
+#include <arm_compute/runtime/CL/CLScheduler.h>
+#include "PadLayer.h"
+#include <cstring>
+
+void PadLayer::configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
+                         unsigned int border_width)
+{
+  _input = input;
+  _output = output;
+  _border_width = border_width;
+  _output_height = _output->info()->dimension(0);
+  _output_width = _output->info()->dimension(1);
+
+  uint8_t constant_border_value = 0;
+  ::arm_compute::PixelValue constant_pixel_value = ::arm_compute::PixelValue(constant_border_value);
+
+  unsigned int padding_size = _border_width;
+  input->info()->extend_padding(::arm_compute::PaddingSize{padding_size});
+  _fillborderkernel.configure(input, _border_width, ::arm_compute::BorderMode::CONSTANT,
+                              constant_pixel_value);
+}
+
+void PadLayer::run(void)
+{
+  _fillborderkernel.run();
+
+  ::arm_compute::Coordinates coordinates =
+      ::arm_compute::Coordinates(-_border_width, -_border_width);
+  ::arm_compute::TensorShape new_tensor_shape =
+      ::arm_compute::TensorShape(_output_height, _output_width);
+
+  /* NOTE: The CL kernel fills the data in the borders (not in the tensor).
+     Once the tensor is received back at NNAPI, we adjust the valid region
+     so that the padding becomes part of the tensor itself and matches the
+     size of the output. */
+  _input->info()->set_valid_region(::arm_compute::ValidRegion(coordinates, new_tensor_shape));
+
+  /* NOTE: The CL kernel does not have an argument for the output tensor,
+     while NNAPI does, so we need to map the input (the tensor that is
+     passed to the CL kernel) back to the output. */
+
+  // TODO: Write a modified CLCopy kernel to do this job.
+  populateOutput();
+}
+
+void PadLayer::populateOutput()
+{
+  auto &queue = ::arm_compute::CLScheduler::get().queue();
+  _input->map(queue);
+  _output->map(queue);
+
+  auto input_tensor = static_cast<::arm_compute::ITensor *>(_input);
+  auto const source_data = input_tensor->buffer();
+
+  auto output_tensor = static_cast<::arm_compute::ITensor *>(_output);
+  auto dst_data = output_tensor->buffer();
+
+  memmove(dst_data, source_data, _output_height * _output_width * 4);
+
+  _input->unmap(queue);
+  _output->unmap(queue);
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h
new file mode 100644
index 0000000..ba4b851
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h
@@ -0,0 +1,25 @@
+#ifndef __PAD_LAYER_H__
+#define __PAD_LAYER_H__
+
+#include <arm_compute/core/CL/ICLTensor.h>
+#include <arm_compute/runtime/CL/functions/CLFillBorder.h>
+
+class PadLayer : public ::arm_compute::IFunction
+{
+public:
+  void configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
+                 unsigned int border_width);
+  void run(void) override;
+
+private:
+  ::arm_compute::ICLTensor *_input;
+  ::arm_compute::ICLTensor *_output;
+  int _border_width;
+  int _output_height;
+  int _output_width;
+
+  ::arm_compute::CLFillBorder _fillborderkernel;
+  void populateOutput();
+};
+
+#endif // __PAD_LAYER_H__
diff --git a/runtimes/pure_arm_compute/src/internal/op/NodeVisitor.h b/runtimes/pure_arm_compute/src/internal/op/NodeVisitor.h
index b2361dc..1cf1d96 100644
--- a/runtimes/pure_arm_compute/src/internal/op/NodeVisitor.h
+++ b/runtimes/pure_arm_compute/src/internal/op/NodeVisitor.h
@@ -33,6 +33,7 @@
 #include "internal/op/Floor.h"
 #include "internal/op/Split.h"
 #include "internal/op/RSQRT.h"
+#include "internal/op/Pad.h"
 
 namespace internal
 {
@@ -81,6 +82,7 @@ struct NodeVisitor
   virtual void visit(const Floor::Node &) = 0;
   virtual void visit(const Split::Node &) = 0;
   virtual void visit(const RSQRT::Node &) = 0;
+  virtual void visit(const Pad::Node &) = 0;
 };
 
 } // namespace op
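Review note on PadLayer::populateOutput() above (a sketch, not part of the
diff): the memmove hardcodes a 4-byte element size, which only holds for
FP32/int32 tensors. The byte count could instead be derived from the tensor
info via arm_compute's ITensorInfo::element_size(), reusing the names from
the patch:

  // Derive the copy size from the element type instead of hardcoding 4 bytes
  const size_t element_size = _input->info()->element_size();
  memmove(dst_data, source_data, _output_height * _output_width * element_size);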
diff --git a/runtimes/pure_arm_compute/src/internal/op/Pad.cc b/runtimes/pure_arm_compute/src/internal/op/Pad.cc
new file mode 100644
index 0000000..10b5521
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/op/Pad.cc
@@ -0,0 +1,47 @@
+#include "internal/op/Pad.h"
+#include "internal/op/NodeVisitor.h"
+
+#include <cassert>
+
+namespace internal
+{
+namespace tflite
+{
+namespace op
+{
+namespace Pad
+{
+
+void Node::accept(NodeVisitor &&v) const { v.visit(*this); }
+
+} // namespace Pad
+} // namespace op
+} // namespace tflite
+} // namespace internal
+
+namespace internal
+{
+namespace tflite
+{
+namespace op
+{
+namespace Pad
+{
+
+Param::Param(uint32_t inputCount, const uint32_t *inputs, uint32_t outputCount,
+             const uint32_t *outputs)
+{
+  assert(inputCount == 2 && outputCount == 1);
+  ofm_index = outputs[0];
+
+  // Each input should be interpreted as follows:
+  //
+  //   0 -> input tensor index
+  //   1 -> paddings tensor index
+  ifm_index = inputs[0];
+  paddings_index = inputs[1];
+}
+} // namespace Pad
+} // namespace op
+} // namespace tflite
+} // namespace internal
diff --git a/runtimes/pure_arm_compute/src/internal/op/Pad.h b/runtimes/pure_arm_compute/src/internal/op/Pad.h
new file mode 100644
index 0000000..410afb1
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/op/Pad.h
@@ -0,0 +1,53 @@
+#ifndef __INTERNAL_OP_PAD_H__
+#define __INTERNAL_OP_PAD_H__
+
+#include "internal/op/Node.h"
+
+#include <cstdint>
+
+namespace internal
+{
+namespace tflite
+{
+namespace op
+{
+namespace Pad
+{
+
+struct Param
+{
+  int32_t ifm_index;
+  int32_t paddings_index;
+  int32_t ofm_index;
+
+  Param() = default;
+  Param(uint32_t inputCount, const uint32_t *inputs, uint32_t outputCount, const uint32_t *outputs);
+};
+
+class Node final : public op::Node
+{
+public:
+  Node(const Param &param) : _param(param)
+  {
+    // DO NOTHING
+  }
+
+public:
+  virtual ~Node() = default;
+
+public:
+  const Param &param(void) const { return _param; }
+
+public:
+  void accept(NodeVisitor &&) const override;
+
+private:
+  const Param _param;
+};
+
+} // namespace Pad
+} // namespace op
+} // namespace tflite
+} // namespace internal
+
+#endif // __INTERNAL_OP_PAD_H__
diff --git a/runtimes/pure_arm_compute/src/model.cc b/runtimes/pure_arm_compute/src/model.cc
index 8f4334d..6663c1e 100644
--- a/runtimes/pure_arm_compute/src/model.cc
+++ b/runtimes/pure_arm_compute/src/model.cc
@@ -554,6 +554,20 @@ int ANeuralNetworksModel_addOperation(ANeuralNetworksModel *model,
       break;
     }
+    case ANEURALNETWORKS_PAD:
+    {
+      assert(inputCount == 2 && outputCount == 1);
+
+      using internal::tflite::op::Pad::Param;
+      using internal::tflite::op::Pad::Node;
+
+      // Add 'operations'
+      auto &operations = model->deref().operations();
+
+      operations.emplace_back<Node>(Param{inputCount, inputs, outputCount, outputs});
+
+      break;
+    }
     default:
       throw std::runtime_error{"Not supported operation"};
   };
-- 
2.7.4
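Appendix (not part of the patch): the Node/NodeVisitor double dispatch that
Pad plugs into, reduced to a minimal standalone sketch. PadNode and Planner
here are illustrative stand-ins for internal::tflite::op::Pad::Node and the
Planner class in compilation.cc:

  #include <iostream>

  struct PadNode; // forward declaration for the visitor

  struct NodeVisitor
  {
    virtual ~NodeVisitor() = default;
    virtual void visit(const PadNode &) = 0;
  };

  struct Node
  {
    virtual ~Node() = default;
    // Takes an rvalue visitor, matching the runtime's accept(NodeVisitor &&)
    virtual void accept(NodeVisitor &&v) const = 0;
  };

  struct PadNode final : Node
  {
    // Double dispatch: the static type of *this selects the visit() overload
    void accept(NodeVisitor &&v) const override { v.visit(*this); }
  };

  struct Planner final : NodeVisitor
  {
    void visit(const PadNode &) override { std::cout << "plan Pad\n"; }
  };

  int main()
  {
    PadNode pad;
    pad.accept(Planner{}); // prints "plan Pad"
  }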