From eb935598902d7a11cb14a5e0edbfc9c4d4ff6f09 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Siva=20Sai=20Vaddipati/System=20SW=20/SRI-Bangalore/Enginee?=
 =?utf8?q?r/=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?=
Date: Mon, 3 Sep 2018 14:58:01 +0530
Subject: [PATCH] Enabling PAD operation in NNAPI (#2097)

This commit enables the PAD operation in the pure_arm_compute runtime.

Signed-off-by: Siva Sai
---
 runtimes/pure_arm_compute/src/compilation.cc       | 62 ++++++++++++++++++++++
 .../src/internal/layers/PadLayer.cc                | 62 ++++++++++++++++++++++
 .../src/internal/layers/PadLayer.h                 | 25 +++++++++
 .../pure_arm_compute/src/internal/op/NodeVisitor.h |  2 +
 runtimes/pure_arm_compute/src/internal/op/Pad.cc   | 47 ++++++++++++++++
 runtimes/pure_arm_compute/src/internal/op/Pad.h    | 53 ++++++++++++++++++
 runtimes/pure_arm_compute/src/model.cc             | 14 +++++
 7 files changed, 265 insertions(+)
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/PadLayer.h
 create mode 100644 runtimes/pure_arm_compute/src/internal/op/Pad.cc
 create mode 100644 runtimes/pure_arm_compute/src/internal/op/Pad.h
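Note for reviewers (not part of the diff): a minimal sketch of how a client
would exercise this path through the NNAPI C API that this runtime implements
(NeuralNetworks.h). Operand indices, shapes, and padding values below are
illustrative only:

  #include <NeuralNetworks.h>
  #include <cstdint>

  // Build a tiny model with a single PAD operation: a 1x2x2x1 FP32 input,
  // a [4, 2] int32 paddings tensor, and the padded 1x4x4x1 output.
  void build_pad_model(ANeuralNetworksModel *model)
  {
    uint32_t in_dims[4] = {1, 2, 2, 1};
    ANeuralNetworksOperandType input{ANEURALNETWORKS_TENSOR_FLOAT32, 4, in_dims, 0.0f, 0};
    ANeuralNetworksModel_addOperand(model, &input); // operand 0

    uint32_t pad_dims[2] = {4, 2};
    ANeuralNetworksOperandType paddings{ANEURALNETWORKS_TENSOR_INT32, 2, pad_dims, 0.0f, 0};
    ANeuralNetworksModel_addOperand(model, &paddings); // operand 1

    // Uniform padding of 1 on H and W -- the case this patch supports
    int32_t pad_values[8] = {0, 0, 1, 1, 1, 1, 0, 0};
    ANeuralNetworksModel_setOperandValue(model, 1, pad_values, sizeof(pad_values));

    uint32_t out_dims[4] = {1, 4, 4, 1};
    ANeuralNetworksOperandType output{ANEURALNETWORKS_TENSOR_FLOAT32, 4, out_dims, 0.0f, 0};
    ANeuralNetworksModel_addOperand(model, &output); // operand 2

    uint32_t inputs[2] = {0, 1};
    uint32_t outputs[1] = {2};
    ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_PAD, 2, inputs, 1, outputs);
  }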
diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc
index 6912698..1fdc23d 100644
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -50,6 +50,7 @@
 #include "internal/layers/SimpleArithmeticAddition.h"
 #include "internal/layers/SimpleCastLayer.h"
 #include "internal/layers/GenericFullyConnectedLayer.h"
+#include "internal/layers/PadLayer.h"
 
 #include "util/matrix/IndexIterator.h"
 #include "util/kernel/IndexIterator.h"
@@ -475,6 +476,7 @@ public:
   void visit(const ::internal::tflite::op::Floor::Node &node) override;
   void visit(const ::internal::tflite::op::Split::Node &node) override;
   void visit(const ::internal::tflite::op::RSQRT::Node &node) override;
+  void visit(const ::internal::tflite::op::Pad::Node &node) override;
 
 private:
   const ::internal::tflite::operand::Set &_ctx;
@@ -3287,6 +3289,66 @@ void Planner::visit(const ::internal::tflite::op::Split::Node &node)
   // NOTE Split has no actual operation!
 }
 
+void Planner::visit(const ::internal::tflite::op::Pad::Node &node)
+{
+  const ::internal::tflite::operand::Index ofm_index{node.param().ofm_index};
+  const ::internal::tflite::operand::Index ifm_index{node.param().ifm_index};
+  const ::internal::tflite::operand::Index paddings_index{node.param().paddings_index};
+
+  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature();
+  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature();
+  const auto paddings_shape = _ctx.at(paddings_index).shape().asTensor();
+
+  // Set shape constraints and TensorInfo
+  _builder.addShapeConstr(
+      ifm_index, asTensorInfo(asTensorShape(_ctx.at(ifm_index).shape()), _ctx.at(ifm_index).type(),
+                              _ctx.at(ifm_index).scale(), _ctx.at(ifm_index).zeroPoint()));
+  _builder.addShapeConstr(
+      ofm_index, asTensorInfo(asTensorShape(_ctx.at(ofm_index).shape()), _ctx.at(ofm_index).type(),
+                              _ctx.at(ofm_index).scale(), _ctx.at(ofm_index).zeroPoint()));
+  _builder.addShapeConstr(
+      paddings_index,
+      asTensorInfo(asTensorShape(_ctx.at(paddings_index).shape()), _ctx.at(paddings_index).type(),
+                   _ctx.at(paddings_index).scale(), _ctx.at(paddings_index).zeroPoint()));
+
+  // Construct operation parameters
+  struct Param
+  {
+    int ofm_index;
+    int ifm_index;
+    int32_t padding_size;
+  };
+
+  Param param;
+
+  param.ofm_index = ofm_index.asInt();
+  param.ifm_index = ifm_index.asInt();
+
+  assert(_ctx.at(paddings_index).hasData() == true);
+
+  // TODO Currently only uniform padding of the tensor is supported
+  //      (TOP = BOTTOM = LEFT = RIGHT), so only a single value is read.
+  //      Read the padding values for all four sides (TOP, BOTTOM, LEFT &
+  //      RIGHT) once non-uniform padding is supported.
+  const auto &padding_data = _ctx.at(paddings_index).data();
+  auto base = padding_data.base();
+  auto padsize = reinterpret_cast<const int32_t *>(base) + 3;
+  param.padding_size = *padsize;
+
+  auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
+    auto ofm_alloc = ctx.at(::internal::tflite::operand::Index{param.ofm_index});
+    auto ifm_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index});
+
+    auto fn = nnfw::make_unique<PadLayer>();
+
+    fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), param.padding_size);
+    builder.append("Pad", std::move(fn));
+  };
+
+  _builder.addStage(stage);
+}
+
 class AllocationContext final : public IAllocationContext
 {
 public:
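Note on the offset used above (a sketch, not part of the diff): for a 4-D
NHWC input the NNAPI PAD paddings operand has shape [4, 2], stored row-major
as {N_before, N_after, top, bottom, left, right, C_before, C_after}, so
reading flattened offset 3 picks up the H-after (bottom) entry; any of
offsets 2..5 would do while padding is uniform. Reading all four sides, once
supported, could look like:

  const auto base = reinterpret_cast<const int32_t *>(padding_data.base());
  // Flattened [4, 2] layout for NHWC:
  // { N_before, N_after, top, bottom, left, right, C_before, C_after }
  const int32_t pad_top = base[2];
  const int32_t pad_bottom = base[3];
  const int32_t pad_left = base[4];
  const int32_t pad_right = base[5];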
diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc
new file mode 100644
index 0000000..857e077
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc
@@ -0,0 +1,62 @@
+#include <arm_compute/runtime/CL/CLScheduler.h>
+#include "PadLayer.h"
+#include <cstring>
+
+void PadLayer::configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
+                         unsigned int border_width)
+{
+  _input = input;
+  _output = output;
+  _border_width = border_width;
+  _output_height = _output->info()->dimension(0);
+  _output_width = _output->info()->dimension(1);
+
+  uint8_t constant_border_value = 0;
+  ::arm_compute::PixelValue constant_pixel_value = ::arm_compute::PixelValue(constant_border_value);
+
+  unsigned int padding_size = _border_width;
+  input->info()->extend_padding(::arm_compute::PaddingSize{padding_size});
+  _fillborderkernel.configure(input, _border_width, ::arm_compute::BorderMode::CONSTANT,
+                              constant_pixel_value);
+}
+
+void PadLayer::run(void)
+{
+  _fillborderkernel.run();
+
+  ::arm_compute::Coordinates coordinates =
+      ::arm_compute::Coordinates(-_border_width, -_border_width);
+  ::arm_compute::TensorShape new_tensor_shape =
+      ::arm_compute::TensorShape(_output_height, _output_width);
+
+  /* NOTE: The CL kernel fills the data in the borders (not in the tensor).
+     Once the tensor is received back at NNAPI, we adjust the valid region
+     so that the padding becomes part of the tensor itself and matches the
+     size of the output. */
+  _input->info()->set_valid_region(::arm_compute::ValidRegion(coordinates, new_tensor_shape));
+
+  /* NOTE: The CL kernel does not have an argument for the output tensor,
+     while NNAPI does, so we need to map the input (the tensor that is
+     passed to the CL kernel) back to the output. */
+
+  // TODO: Write a modified CLCopy kernel to do this job.
+  populateOutput();
+}
+
+void PadLayer::populateOutput()
+{
+  auto &queue = ::arm_compute::CLScheduler::get().queue();
+  _input->map(queue);
+  _output->map(queue);
+
+  auto input_tensor = static_cast<::arm_compute::ITensor *>(_input);
+  auto const source_data = input_tensor->buffer();
+
+  auto output_tensor = static_cast<::arm_compute::ITensor *>(_output);
+  auto dst_data = output_tensor->buffer();
+
+  memmove(dst_data, source_data, _output_height * _output_width * 4);
+
+  _input->unmap(queue);
+  _output->unmap(queue);
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h
new file mode 100644
index 0000000..ba4b851
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h
@@ -0,0 +1,25 @@
+#ifndef __PAD_LAYER_H__
+#define __PAD_LAYER_H__
+
+#include <arm_compute/core/CL/ICLTensor.h>
+#include <arm_compute/runtime/CL/functions/CLFillBorder.h>
+
+class PadLayer : public ::arm_compute::IFunction
+{
+public:
+  void configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
+                 unsigned int border_width);
+  void run(void) override;
+
+private:
+  ::arm_compute::ICLTensor *_input;
+  ::arm_compute::ICLTensor *_output;
+  int _border_width;
+  int _output_height;
+  int _output_width;
+
+  ::arm_compute::CLFillBorder _fillborderkernel;
+  void populateOutput();
+};
+
+#endif // __PAD_LAYER_H__
diff --git a/runtimes/pure_arm_compute/src/internal/op/NodeVisitor.h b/runtimes/pure_arm_compute/src/internal/op/NodeVisitor.h
index b2361dc..1cf1d96 100644
--- a/runtimes/pure_arm_compute/src/internal/op/NodeVisitor.h
+++ b/runtimes/pure_arm_compute/src/internal/op/NodeVisitor.h
@@ -33,6 +33,7 @@
 #include "internal/op/Floor.h"
 #include "internal/op/Split.h"
 #include "internal/op/RSQRT.h"
+#include "internal/op/Pad.h"
 
 namespace internal
 {
@@ -81,6 +82,7 @@ struct NodeVisitor
   virtual void visit(const Floor::Node &) = 0;
   virtual void visit(const Split::Node &) = 0;
   virtual void visit(const RSQRT::Node &) = 0;
+  virtual void visit(const Pad::Node &) = 0;
 };
 
 } // namespace op
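Review note on PadLayer::populateOutput() above (a sketch, not part of the
diff): the memmove hardcodes a 4-byte element size, which only holds for
FP32/int32 tensors. The byte count could instead be derived from the tensor
info via arm_compute's ITensorInfo::element_size(), reusing the names from
the patch:

  // Derive the copy size from the element type instead of hardcoding 4 bytes
  const size_t element_size = _input->info()->element_size();
  memmove(dst_data, source_data, _output_height * _output_width * element_size);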
diff --git a/runtimes/pure_arm_compute/src/internal/op/Pad.cc b/runtimes/pure_arm_compute/src/internal/op/Pad.cc
new file mode 100644
index 0000000..10b5521
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/op/Pad.cc
@@ -0,0 +1,47 @@
+#include "internal/op/Pad.h"
+#include "internal/op/NodeVisitor.h"
+
+#include <cassert>
+
+namespace internal
+{
+namespace tflite
+{
+namespace op
+{
+namespace Pad
+{
+
+void Node::accept(NodeVisitor &&v) const { v.visit(*this); }
+
+} // namespace Pad
+} // namespace op
+} // namespace tflite
+} // namespace internal
+
+namespace internal
+{
+namespace tflite
+{
+namespace op
+{
+namespace Pad
+{
+
+Param::Param(uint32_t inputCount, const uint32_t *inputs, uint32_t outputCount,
+             const uint32_t *outputs)
+{
+  assert(inputCount == 2 && outputCount == 1);
+  ofm_index = outputs[0];
+
+  // Each input should be interpreted as follows:
+  //
+  //   0 -> input tensor index
+  //   1 -> paddings tensor index
+  ifm_index = inputs[0];
+  paddings_index = inputs[1];
+}
+} // namespace Pad
+} // namespace op
+} // namespace tflite
+} // namespace internal
diff --git a/runtimes/pure_arm_compute/src/internal/op/Pad.h b/runtimes/pure_arm_compute/src/internal/op/Pad.h
new file mode 100644
index 0000000..410afb1
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/op/Pad.h
@@ -0,0 +1,53 @@
+#ifndef __INTERNAL_OP_PAD_H__
+#define __INTERNAL_OP_PAD_H__
+
+#include "internal/op/Node.h"
+
+#include <cstdint>
+
+namespace internal
+{
+namespace tflite
+{
+namespace op
+{
+namespace Pad
+{
+
+struct Param
+{
+  int32_t ifm_index;
+  int32_t paddings_index;
+  int32_t ofm_index;
+
+  Param() = default;
+  Param(uint32_t inputCount, const uint32_t *inputs, uint32_t outputCount, const uint32_t *outputs);
+};
+
+class Node final : public op::Node
+{
+public:
+  Node(const Param &param) : _param(param)
+  {
+    // DO NOTHING
+  }
+
+public:
+  virtual ~Node() = default;
+
+public:
+  const Param &param(void) const { return _param; }
+
+public:
+  void accept(NodeVisitor &&) const override;
+
+private:
+  const Param _param;
+};
+
+} // namespace Pad
+} // namespace op
+} // namespace tflite
+} // namespace internal
+
+#endif // __INTERNAL_OP_PAD_H__
diff --git a/runtimes/pure_arm_compute/src/model.cc b/runtimes/pure_arm_compute/src/model.cc
index 8f4334d..6663c1e 100644
--- a/runtimes/pure_arm_compute/src/model.cc
+++ b/runtimes/pure_arm_compute/src/model.cc
@@ -554,6 +554,20 @@ int ANeuralNetworksModel_addOperation(ANeuralNetworksModel *model,
       break;
     }
+    case ANEURALNETWORKS_PAD:
+    {
+      assert(inputCount == 2 && outputCount == 1);
+
+      using internal::tflite::op::Pad::Param;
+      using internal::tflite::op::Pad::Node;
+
+      // Add 'operations'
+      auto &operations = model->deref().operations();
+
+      operations.emplace_back<Node>(Param{inputCount, inputs, outputCount, outputs});
+
+      break;
+    }
     default:
       throw std::runtime_error{"Not supported operation"};
   };
-- 
2.7.4
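Appendix (not part of the patch): the Node/NodeVisitor double dispatch that
Pad plugs into, reduced to a minimal standalone sketch. PadNode and Planner
here are illustrative stand-ins for internal::tflite::op::Pad::Node and the
Planner class in compilation.cc:

  #include <iostream>

  struct PadNode; // forward declaration for the visitor

  struct NodeVisitor
  {
    virtual ~NodeVisitor() = default;
    virtual void visit(const PadNode &) = 0;
  };

  struct Node
  {
    virtual ~Node() = default;
    // Takes an rvalue visitor, matching the runtime's accept(NodeVisitor &&)
    virtual void accept(NodeVisitor &&v) const = 0;
  };

  struct PadNode final : Node
  {
    // Double dispatch: the static type of *this selects the visit() overload
    void accept(NodeVisitor &&v) const override { v.visit(*this); }
  };

  struct Planner final : NodeVisitor
  {
    void visit(const PadNode &) override { std::cout << "plan Pad\n"; }
  };

  int main()
  {
    PadNode pad;
    pad.accept(Planner{}); // prints "plan Pad"
  }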