Enabling PAD operation in NNAPI (#2097)
author Siva Sai Vaddipati / System SW / SRI-Bangalore / Engineer / Samsung Electronics <siva.sai@samsung.com>
Mon, 3 Sep 2018 09:28:01 +0000 (14:58 +0530)
committer Hyeongseok Oh / Motion Control Lab (SR) / Staff Engineer / Samsung Electronics <hseok82.oh@samsung.com>
Mon, 3 Sep 2018 09:28:01 +0000 (18:28 +0900)
This commit enables the PAD operation in the pure_arm_compute runtime.

Signed-off-by: Siva Sai <siva.sai@samsung.com>
runtimes/pure_arm_compute/src/compilation.cc
runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc [new file with mode: 0644]
runtimes/pure_arm_compute/src/internal/layers/PadLayer.h [new file with mode: 0644]
runtimes/pure_arm_compute/src/internal/op/NodeVisitor.h
runtimes/pure_arm_compute/src/internal/op/Pad.cc [new file with mode: 0644]
runtimes/pure_arm_compute/src/internal/op/Pad.h [new file with mode: 0644]
runtimes/pure_arm_compute/src/model.cc

index 6912698..1fdc23d 100644 (file)
@@ -50,6 +50,7 @@
 #include "internal/layers/SimpleArithmeticAddition.h"
 #include "internal/layers/SimpleCastLayer.h"
 #include "internal/layers/GenericFullyConnectedLayer.h"
+#include "internal/layers/PadLayer.h"
 
 #include "util/matrix/IndexIterator.h"
 #include "util/kernel/IndexIterator.h"
@@ -475,6 +476,7 @@ public:
   void visit(const ::internal::tflite::op::Floor::Node &node) override;
   void visit(const ::internal::tflite::op::Split::Node &node) override;
   void visit(const ::internal::tflite::op::RSQRT::Node &node) override;
+  void visit(const ::internal::tflite::op::Pad::Node &node) override;
 
 private:
   const ::internal::tflite::operand::Set &_ctx;
@@ -3287,6 +3289,66 @@ void Planner::visit(const ::internal::tflite::op::Split::Node &node)
   // NOTE Split has no actual operation!
 }
 
+void Planner::visit(const ::internal::tflite::op::Pad::Node &node)
+{
+  const ::internal::tflite::operand::Index ofm_index{node.param().ofm_index};
+  const ::internal::tflite::operand::Index ifm_index{node.param().ifm_index};
+  const ::internal::tflite::operand::Index paddings_index{node.param().paddings_index};
+
+  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature();
+  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature();
+  const auto paddings_shape = _ctx.at(paddings_index).shape().asTensor();
+
+  // Set Shape Constraints and TensorInfo
+  _builder.addShapeConstr(
+      ifm_index, asTensorInfo(asTensorShape(_ctx.at(ifm_index).shape()), _ctx.at(ifm_index).type(),
+                              _ctx.at(ifm_index).scale(), _ctx.at(ifm_index).zeroPoint()));
+  _builder.addShapeConstr(
+      ofm_index, asTensorInfo(asTensorShape(_ctx.at(ofm_index).shape()), _ctx.at(ofm_index).type(),
+                              _ctx.at(ofm_index).scale(), _ctx.at(ofm_index).zeroPoint()));
+  _builder.addShapeConstr(
+      paddings_index,
+      asTensorInfo(asTensorShape(_ctx.at(paddings_index).shape()), _ctx.at(paddings_index).type(),
+                   _ctx.at(paddings_index).scale(), _ctx.at(paddings_index).zeroPoint()));
+
+  // Construct operation parameters
+  struct Param
+  {
+    int ofm_index;
+    int ifm_index;
+    int32_t padding_size;
+  };
+
+  Param param;
+
+  param.ofm_index = ofm_index.asInt();
+  param.ifm_index = ifm_index.asInt();
+
+  assert(_ctx.at(paddings_index).hasData() == true);
+
+  // TODO: Currently only uniform padding is supported, so a single value is read
+  //       and applied to every side (TOP = BOTTOM = LEFT = RIGHT).
+  //       Padding values for each side (TOP, BOTTOM, LEFT & RIGHT) still need to be read.
+
+  const auto &padding_data = _ctx.at(paddings_index).data();
+  auto base = padding_data.base();
+  auto padsize = reinterpret_cast<const int *>(base) + 3;
+  param.padding_size = *padsize;
+
+  auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
+    auto ofm_alloc = ctx.at(::internal::tflite::operand::Index{param.ofm_index});
+    auto ifm_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index});
+
+    auto fn = nnfw::make_unique<PadLayer>();
+
+    fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), param.padding_size);
+    builder.append("Pad", std::move(fn));
+
+  };
+
+  _builder.addStage(stage);
+}
+
 class AllocationContext final : public IAllocationContext
 {
 public:
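Note on the uniform-padding restriction in the stage above: the PAD paddings operand is a [rank(input), 2] int32 tensor, so for a 4-D NHWC input the flattened offset 3 read here is paddings[1][1] (the pad after the height dimension), which is then applied to every side. A minimal sketch of how all four before/after pairs could be decoded once non-uniform padding is supported; the struct and helper below are hypothetical and not part of this commit:

#include <cstdint>

// Hypothetical helper (not part of this commit): decodes the flattened
// [4, 2] int32 paddings tensor of an NHWC input into named fields.
struct Paddings4D
{
  int32_t batch_before, batch_after;     // offsets 0, 1
  int32_t height_before, height_after;   // offsets 2, 3 (offset 3 is the single value read above)
  int32_t width_before, width_after;     // offsets 4, 5
  int32_t channel_before, channel_after; // offsets 6, 7
};

inline Paddings4D decodePaddings(const uint8_t *base)
{
  auto p = reinterpret_cast<const int32_t *>(base);
  return Paddings4D{p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]};
}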
diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc
new file mode 100644 (file)
index 0000000..857e077
--- /dev/null
@@ -0,0 +1,62 @@
+#include <cstring>
+#include "PadLayer.h"\r
+#include <arm_compute/runtime/CL/CLScheduler.h>\r
+\r
+void PadLayer::configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,\r
+                         unsigned int border_width)\r
+{\r
+  _input = input;\r
+  _output = output;\r
+  _border_width = border_width;\r
+  _output_height = _output->info()->dimension(0);\r
+  _output_width = _output->info()->dimension(1);\r
+\r
+  uint8_t constant_border_value = 0;\r
+  ::arm_compute::PixelValue constant_pixel_value = ::arm_compute::PixelValue(constant_border_value);\r
+\r
+  unsigned int padding_size = _border_width;\r
+  input->info()->extend_padding(::arm_compute::PaddingSize{padding_size});\r
+  _fillborderkernel.configure(input, _border_width, ::arm_compute::BorderMode::CONSTANT,\r
+                              constant_pixel_value);\r
+}\r
+\r
+void PadLayer::run(void)\r
+{\r
+  _fillborderkernel.run();\r
+\r
+  ::arm_compute::Coordinates coordinates =\r
+      ::arm_compute::Coordinates(-_border_width, -_border_width);\r
+  ::arm_compute::TensorShape new_tensor_shape =\r
+      ::arm_compute::TensorShape(_output_height, _output_width);\r
+\r
+  /* NOTE: The CL kernel fills data only in the borders (not in the tensor itself).
+           Once the tensor is received back at NNAPI, we adjust its valid region
+           so that the padding becomes part of the tensor and its size matches
+           the size of the output. */
+  _input->info()->set_valid_region(::arm_compute::ValidRegion(coordinates, new_tensor_shape));
+
+  /* NOTE: The CL kernel does not take an output tensor argument while NNAPI does,
+           so we need to copy the input (the tensor passed to the CL kernel) back
+           to the output. */
+
+  // TODO: Write a modified CLCopy kernel to do this job.
+  populateOutput();
+}
+
+void PadLayer::populateOutput()
+{
+  auto &queue = ::arm_compute::CLScheduler::get().queue();
+  _input->map(queue);
+  _output->map(queue);
+
+  auto input_tensor = static_cast<::arm_compute::ITensor *>(_input);
+  auto const source_data = input_tensor->buffer();
+
+  auto output_tensor = static_cast<::arm_compute::ITensor *>(_output);
+  auto dst_data = output_tensor->buffer();
+
+  memmove(dst_data, source_data, _output_height * _output_width * 4);
+
+  _input->unmap(queue);
+  _output->unmap(queue);
+}
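For reference, the fill-border / valid-region / copy sequence above amounts to a constant (zero) pad of a 2-D plane. A CPU-side sketch under the same uniform-border and 4-byte (FLOAT32) element assumptions; names are illustrative only:

#include <vector>

// Reference-only sketch: pad a height x width float plane with a uniform
// zero border of size 'border' on every side, matching what the CL path
// above produces for a single 2-D plane.
std::vector<float> padPlane(const std::vector<float> &in, int height, int width, int border)
{
  const int out_h = height + 2 * border;
  const int out_w = width + 2 * border;
  std::vector<float> out(out_h * out_w, 0.0f); // constant border value = 0

  for (int y = 0; y < height; ++y)
  {
    for (int x = 0; x < width; ++x)
    {
      out[(y + border) * out_w + (x + border)] = in[y * width + x];
    }
  }
  return out;
}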
diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h
new file mode 100644 (file)
index 0000000..ba4b851
--- /dev/null
@@ -0,0 +1,25 @@
+#ifndef __PAD_LAYER_H__
+#define __PAD_LAYER_H__
+
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/runtime/CL/functions/CLFillBorder.h>
+
+class PadLayer : public ::arm_compute::IFunction
+{
+public:
+  void configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
+                 unsigned int border_width);
+  void run(void) override;
+
+private:
+  ::arm_compute::ICLTensor *_input;
+  ::arm_compute::ICLTensor *_output;
+  int _border_width;
+  int _output_height;
+  int _output_width;
+
+  ::arm_compute::CLFillBorder _fillborderkernel;
+  void populateOutput();
+};
+
+#endif // __PAD_LAYER_H__
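A hedged usage sketch of the new layer, assuming the ARM Compute Library CL backend is available; sizes are illustrative (output is input plus 2 * border_width per spatial side), and allocation is done after configure() so the extended input padding is included:

#include <arm_compute/core/TensorInfo.h>
#include <arm_compute/runtime/CL/CLScheduler.h>
#include <arm_compute/runtime/CL/CLTensor.h>
#include "PadLayer.h"

void padLayerExample()
{
  ::arm_compute::CLScheduler::get().default_init();

  // 4x4 FLOAT32 input, 6x6 output for a uniform border of 1
  ::arm_compute::CLTensor input, output;
  input.allocator()->init(::arm_compute::TensorInfo(::arm_compute::TensorShape(4U, 4U), 1,
                                                    ::arm_compute::DataType::F32));
  output.allocator()->init(::arm_compute::TensorInfo(::arm_compute::TensorShape(6U, 6U), 1,
                                                     ::arm_compute::DataType::F32));

  PadLayer pad;
  pad.configure(&input, &output, 1 /* border_width */);

  // Allocate backing memory after configure() so the extended input padding is covered.
  input.allocator()->allocate();
  output.allocator()->allocate();

  pad.run();
}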
index b2361dc..1cf1d96 100644 (file)
@@ -33,6 +33,7 @@
 #include "internal/op/Floor.h"
 #include "internal/op/Split.h"
 #include "internal/op/RSQRT.h"
+#include "internal/op/Pad.h"
 
 namespace internal
 {
@@ -81,6 +82,7 @@ struct NodeVisitor
   virtual void visit(const Floor::Node &) = 0;
   virtual void visit(const Split::Node &) = 0;
   virtual void visit(const RSQRT::Node &) = 0;
+  virtual void visit(const Pad::Node &) = 0;
 };
 
 } // namespace op
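The extra visit() overload above is the only NodeVisitor change PAD needs; the matching accept() override lives in the new Pad::Node below. A stripped-down sketch of this double-dispatch pattern, using hypothetical names rather than the runtime's real classes:

struct PadNode;

struct Visitor
{
  virtual ~Visitor() = default;
  virtual void visit(const PadNode &) = 0;
};

struct PadNode
{
  // accept() forwards to the visitor, so the concrete op type is resolved
  // by overload resolution instead of a switch on an op enum.
  void accept(Visitor &&v) const { v.visit(*this); }
};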
diff --git a/runtimes/pure_arm_compute/src/internal/op/Pad.cc b/runtimes/pure_arm_compute/src/internal/op/Pad.cc
new file mode 100644 (file)
index 0000000..10b5521
--- /dev/null
@@ -0,0 +1,47 @@
+#include "internal/op/Pad.h"\r
+#include "internal/op/NodeVisitor.h"\r
+\r
+#include <cassert>\r
+\r
+namespace internal\r
+{\r
+namespace tflite\r
+{\r
+namespace op\r
+{\r
+namespace Pad\r
+{\r
+\r
+void Node::accept(NodeVisitor &&v) const { v.visit(*this); }\r
+\r
+} // namespace Pad\r
+} // namespace op\r
+} // namespace tflite\r
+} // namespace internal\r
+\r
+namespace internal\r
+{\r
+namespace tflite\r
+{\r
+namespace op\r
+{\r
+namespace Pad\r
+{\r
+\r
+Param::Param(uint32_t inputCount, const uint32_t *inputs, uint32_t outputCount,\r
+             const uint32_t *outputs)\r
+{\r
+  assert(inputCount == 2 && outputCount == 1);\r
+  ofm_index = outputs[0];\r
+\r
+  // Each input should be interpreted as follows:\r
+  //\r
+  //  0 -> input Tensor Index\r
+  //  1 -> paddings\r
+  ifm_index = inputs[0];\r
+  paddings_index = inputs[1];\r
+}\r
+} // namespace Pad\r
+} // namespace op\r
+} // namespace tflite\r
+} // namespace internal\r
diff --git a/runtimes/pure_arm_compute/src/internal/op/Pad.h b/runtimes/pure_arm_compute/src/internal/op/Pad.h
new file mode 100644 (file)
index 0000000..410afb1
--- /dev/null
@@ -0,0 +1,53 @@
+#ifndef __INTERNAL_OP_PAD_H__
+#define __INTERNAL_OP_PAD_H__
+
+#include "internal/op/Node.h"
+
+#include <cstdint>
+
+namespace internal
+{
+namespace tflite
+{
+namespace op
+{
+namespace Pad
+{
+
+struct Param
+{
+  int32_t ifm_index;
+  int32_t paddings_index;
+  int32_t ofm_index;
+
+  Param() = default;
+  Param(uint32_t inputCount, const uint32_t *inputs, uint32_t outputCount, const uint32_t *outputs);
+};
+
+class Node final : public op::Node
+{
+public:
+  Node(const Param &param) : _param(param)
+  {
+    // DO NOTHING
+  }
+
+public:
+  virtual ~Node() = default;
+
+public:
+  const Param &param(void) const { return _param; }
+
+public:
+  void accept(NodeVisitor &&) const override;
+
+private:
+  const Param _param;
+};
+
+} // namespace Pad
+} // namespace op
+} // namespace tflite
+} // namespace internal
+
+#endif // __INTERNAL_OP_PAD_H__
index 8f4334d..6663c1e 100644 (file)
@@ -554,6 +554,20 @@ int ANeuralNetworksModel_addOperation(ANeuralNetworksModel *model,
 
       break;
     }
+    case ANEURALNETWORKS_PAD:
+    {
+      assert(inputCount == 2 && outputCount == 1);
+
+      using internal::tflite::op::Pad::Param;
+      using internal::tflite::op::Pad::Node;
+
+      // Add 'operations'
+      auto &operations = model->deref().operations();
+
+      operations.emplace_back<Node>(Param{inputCount, inputs, outputCount, outputs});
+
+      break;
+    }
     default:
       throw std::runtime_error{"Not supported operation"};
   };
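With the ANEURALNETWORKS_PAD case above in place, a client can construct the operation through the public NNAPI C API. A hedged sketch (error checks omitted; shapes are illustrative, operand indices assume a freshly created model, and note that this runtime currently reads only offset 3 of the paddings tensor and pads all sides uniformly):

#include <NeuralNetworks.h>
#include <cstdint>

void addPadToModel(ANeuralNetworksModel *model)
{
  // Operand 0: 1x4x4x1 FLOAT32 input feature map
  uint32_t ifm_dims[4] = {1, 4, 4, 1};
  ANeuralNetworksOperandType ifm_type = {ANEURALNETWORKS_TENSOR_FLOAT32, 4, ifm_dims, 0.0f, 0};
  ANeuralNetworksModel_addOperand(model, &ifm_type); // operand index 0

  // Operand 1: [4, 2] INT32 paddings tensor (before/after pair per dimension)
  uint32_t pad_dims[2] = {4, 2};
  ANeuralNetworksOperandType pad_type = {ANEURALNETWORKS_TENSOR_INT32, 2, pad_dims, 0.0f, 0};
  ANeuralNetworksModel_addOperand(model, &pad_type); // operand index 1

  // Uniform padding of 1 on height and width only
  int32_t paddings[8] = {0, 0, 1, 1, 1, 1, 0, 0};
  ANeuralNetworksModel_setOperandValue(model, 1, paddings, sizeof(paddings));

  // Operand 2: 1x6x6x1 FLOAT32 output feature map
  uint32_t ofm_dims[4] = {1, 6, 6, 1};
  ANeuralNetworksOperandType ofm_type = {ANEURALNETWORKS_TENSOR_FLOAT32, 4, ofm_dims, 0.0f, 0};
  ANeuralNetworksModel_addOperand(model, &ofm_type); // operand index 2

  uint32_t inputs[2] = {0, 1}; // ifm, paddings
  uint32_t outputs[1] = {2};   // ofm
  ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_PAD, 2, inputs, 1, outputs);
}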