From 89ca51722c3b9ddc477b9df54132769c5003e40b Mon Sep 17 00:00:00 2001
From: =?utf8?q?=EB=B0=95=EC=A2=85=ED=98=84/=EB=8F=99=EC=9E=91=EC=A0=9C?=
 =?utf8?q?=EC=96=B4Lab=28SR=29/Senior=20Engineer/=EC=82=BC=EC=84=B1?=
 =?utf8?q?=EC=A0=84=EC=9E=90?= <jh1302.park@samsung.com>
Date: Mon, 16 Apr 2018 12:35:59 +0900
Subject: [PATCH] [Pure ACL Runtime] Support 'Concat' operation (#693)

This commit introduces partial support for the ANEURALNETWORKS_CONCATENATION
operation.

Currently, only concatenation of feature maps along the depth axis is supported.

Signed-off-by: Jonghyun Park <jh1302.park@samsung.com>
---
 .../bindings/pure_arm_compute/src/compilation.cc   | 162 ++++++++++++++++++++-
 .../pure_arm_compute/src/internal/arm_compute.h    |   6 +
 .../pure_arm_compute/src/internal/op/Concat.cc     |  53 +++++++
 .../pure_arm_compute/src/internal/op/Concat.h      |  56 +++++++
 .../pure_arm_compute/src/internal/op/NodeVisitor.h |   2 +
 .../bindings/pure_arm_compute/src/model.cc         |  12 ++
 6 files changed, 287 insertions(+), 4 deletions(-)
 create mode 100644 tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/op/Concat.cc
 create mode 100644 tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/op/Concat.h

diff --git a/tools/nnapi_bindings/bindings/pure_arm_compute/src/compilation.cc b/tools/nnapi_bindings/bindings/pure_arm_compute/src/compilation.cc
index 92eb5b2..82f2376 100644
--- a/tools/nnapi_bindings/bindings/pure_arm_compute/src/compilation.cc
+++ b/tools/nnapi_bindings/bindings/pure_arm_compute/src/compilation.cc
@@ -4,6 +4,7 @@
 
 #include <arm_compute/runtime/IFunction.h>
 #include <arm_compute/runtime/CL/CLScheduler.h>
+#include <arm_compute/runtime/CL/CLSubTensor.h>
 #include <arm_compute/runtime/CL/functions/CLPoolingLayer.h>
 #include <arm_compute/runtime/CL/functions/CLActivationLayer.h>
 
@@ -155,6 +156,10 @@ struct IPlanBuilder
 
   virtual void addShapeConstr(const ::internal::tflite::operand::Index &ind,
                               const ::arm_compute::TensorInfo &info) = 0;
+  virtual void addSubsumptionConstr(const ::internal::tflite::operand::Index &ind,
+                                    const ::internal::tflite::operand::Index &base,
+                                    const ::arm_compute::Coordinates &offset,
+                                    const ::arm_compute::TensorShape &shape) = 0;
   virtual void addInitializer(const ::internal::tflite::operand::Index &ind,
                               const Initializer &initializer) = 0;
   virtual void addStage(const Stage &) = 0;
@@ -229,6 +234,7 @@ public:
   void visit(const ::internal::tflite::op::Conv2D::implicit::Node &node) override;
   void visit(const ::internal::tflite::op::MaxPool2D::implicit::Node &node) override;
   void visit(const ::internal::tflite::op::AvgPool2D::implicit::Node &node) override;
+  void visit(const ::internal::tflite::op::Concat::Node &node) override;
 
 private:
   const ::internal::tflite::operand::Set &_ctx;
@@ -549,6 +555,40 @@ void Planner::visit(const ::internal::tflite::op::AvgPool2D::implicit::Node &nod
   _builder.addStage(stage);
 }
 
+void Planner::visit(const ::internal::tflite::op::Concat::Node &node)
+{
+  const ::internal::tflite::operand::Index ofm_index{node.param().ofm_index};
+
+  // NOTE This implementation assumes that every input and the output are feature maps
+  // TODO Remove this assumption
+  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature();
+
+  // NOTE This implementation assumes concat over feature depth (axis == 3, assert-only check)
+  // TODO Remove this assumption
+  assert(_ctx.at(::internal::tflite::operand::Index{node.param().axis_index}).asScala<int32_t>() == 3);
+
+  // Set Shape Constraints (for output)
+  _builder.addShapeConstr(ofm_index, asTensorInfo(ofm_shape));
+
+  // Set Subsumption Constraints (for input): each input is subsumed by the output operand
+  uint32_t depth = 0; // running offset: sum of 'C' over the inputs handled so far
+
+  for (const auto &index : node.param().ifm_indexes)
+  {
+    const ::internal::tflite::operand::Index ifm_index{index};
+    const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature();
+
+    _builder.addSubsumptionConstr(ifm_index,
+                                  ofm_index,
+                                  ::arm_compute::Coordinates{0, 0, depth, 0},
+                                  asTensorShape(ifm_shape));
+
+    depth += ifm_shape.C;
+  }
+
+  // NOTE Concat has no actual operation! No stage is added; only constraints are registered
+}
+
 class AllocationContext final : public IAllocationContext
 {
 public:
@@ -597,6 +637,13 @@ public:
   void addShapeConstr(const ::internal::tflite::operand::Index &ind,
                       const ::arm_compute::TensorInfo &info) override;
 
+
+public:
+  void addSubsumptionConstr(const ::internal::tflite::operand::Index &ind,
+                            const ::internal::tflite::operand::Index &base,
+                            const ::arm_compute::Coordinates &offset,
+                            const ::arm_compute::TensorShape &shape) override;
+
 public:
   void addInitializer(const ::internal::tflite::operand::Index &ind,
                       const Initializer &initializer) override;
@@ -611,7 +658,31 @@ private:
   ::internal::arm_compute::Plan &_plan;
 
 private:
+  struct Subsumption // an operand described as a region (offset + shape) of a 'base' operand
+  {
+  public:
+    Subsumption(const ::internal::tflite::operand::Index &base,
+                const ::arm_compute::Coordinates &offset,
+                const ::arm_compute::TensorShape &shape)
+      : _base{base}, _offset{offset}, _shape{shape}
+    {
+      // DO NOTHING
+    }
+
+  public:
+    const ::internal::tflite::operand::Index &base(void) const { return _base; }
+    const ::arm_compute::Coordinates &offset(void) const { return _offset; }
+    const ::arm_compute::TensorShape &shape(void) const { return _shape; }
+
+  private:
+    const ::internal::tflite::operand::Index _base; // operand whose tensor backs this one
+    const ::arm_compute::Coordinates _offset;       // handed to CLSubTensor as its coordinates
+    const ::arm_compute::TensorShape _shape;        // handed to CLSubTensor as its shape
+  };
+
+private:
   std::map<int, ::arm_compute::TensorInfo> _tensor_info_ctx;
+  std::map<int, std::shared_ptr<Subsumption>> _subsumption_ctx;
   std::map<int, Initializer> _initializer_ctx;
   std::vector<Stage> _stages;
 };
@@ -622,6 +693,20 @@ void PlanBuilder::addShapeConstr(const ::internal::tflite::operand::Index &ind,
   _tensor_info_ctx[ind.asInt()] = info;
 }
 
+// Records that operand 'ind' lives inside operand 'base': finalize() creates it as a
+// CLSubTensor of 'base' using 'offset' and 'shape'. A later call for the same 'ind'
+// replaces the earlier record.
+void PlanBuilder::addSubsumptionConstr(const ::internal::tflite::operand::Index &ind,
+                                       const ::internal::tflite::operand::Index &base,
+                                       const ::arm_compute::Coordinates &offset,
+                                       const ::arm_compute::TensorShape &shape)
+{
+  // finalize() creates the base before the operand itself; a self-reference never resolves
+  assert(ind.asInt() != base.asInt());
+
+  _subsumption_ctx[ind.asInt()] = std::make_shared<Subsumption>(base, offset, shape);
+}
+
 void PlanBuilder::addInitializer(const ::internal::tflite::operand::Index &ind,
                                  const Initializer &initializer)
 {
@@ -630,21 +709,96 @@ void PlanBuilder::addInitializer(const ::internal::tflite::operand::Index &ind,
 
 void PlanBuilder::addStage(const Stage &stage) { _stages.emplace_back(stage); }
 
+#include <stack>
+
 void PlanBuilder::finalize(void) const
 {
   // CLTensor objects to be initialized later
   std::vector<std::shared_ptr<::arm_compute::CLTensor>> tensors;
 
-  // Create CLTensor
-  for (auto it = _tensor_info_ctx.begin(); it != _tensor_info_ctx.end(); ++it)
+  // Create CLTensor & CLSubTensor
+  auto isAllocated = [this] (int ind)
+  {
+    const ::internal::tflite::operand::Index operand_index{ind};
+    return _plan.operands().exist(operand_index);
+  };
+
+  auto setCLTensor = [&] (int ind)
   {
     auto tensor = std::make_shared<::arm_compute::CLTensor>();
 
-    tensor->allocator()->init(it->second);
+    tensor->allocator()->init(_tensor_info_ctx.at(ind));
 
     // NOTE Do NOT allocate here. allocate should be invoked after configure functions
-    _plan.operands().set(::internal::tflite::operand::Index{it->first}, tensor);
+    _plan.operands().set(::internal::tflite::operand::Index{ind}, tensor);
     tensors.emplace_back(tensor);
+  };
+
+  auto setCLSubTensor = [&] (int curr)
+  {
+    const auto &sub_info = *(_subsumption_ctx.find(curr)->second);
+
+    auto base_tensor = _plan.operands().at(sub_info.base()).ptr();
+
+    assert(base_tensor != nullptr);
+
+    auto curr_tensor = std::make_shared<::arm_compute::CLSubTensor>(base_tensor,
+                                                                    sub_info.shape(),
+                                                                    sub_info.offset());
+
+    _plan.operands().set(::internal::tflite::operand::Index{curr}, curr_tensor);
+  };
+
+  for (auto it = _subsumption_ctx.begin(); it != _subsumption_ctx.end(); ++it)
+  {
+    std::stack<int> stack;
+
+    stack.push(it->first);
+
+    while (!stack.empty())
+    {
+      const auto curr = stack.top();
+
+      if (isAllocated(curr))
+      {
+        // Skip if already allocated
+        stack.pop();
+        continue;
+      }
+
+      auto it_s = _subsumption_ctx.find(curr);
+
+      if (it_s == _subsumption_ctx.end())
+      {
+        setCLTensor(curr);
+        stack.pop();
+        continue;
+      }
+
+      const auto &sub_info = *(it_s->second);
+
+      if (isAllocated(sub_info.base().asInt()))
+      {
+        setCLSubTensor(curr);
+        stack.pop();
+      }
+      else
+      {
+        // Allocate base tensor first
+        stack.push(sub_info.base().asInt());
+      }
+    }
+  }
+
+  for (auto it = _tensor_info_ctx.begin(); it != _tensor_info_ctx.end(); ++it)
+  {
+    if (isAllocated(it->first))
+    {
+      // Skip if already allocated
+      continue;
+    }
+
+    setCLTensor(it->first);
   }
 
   // Process Stage
diff --git a/tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/arm_compute.h b/tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/arm_compute.h
index 6221ac4..51ec812 100644
--- a/tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/arm_compute.h
+++ b/tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/arm_compute.h
@@ -53,6 +53,12 @@ public:
                const std::shared_ptr<::arm_compute::ICLTensor> &tensor);
 
 public:
+  bool exist(const ::internal::tflite::operand::Index &ind) const
+  {
+    return _objects.find(ind.asInt()) != _objects.end(); // true iff 'ind' has an entry
+  }
+
+public:
   const Object &at(const ::internal::tflite::operand::Index &ind) const
   {
     return _objects.at(ind.asInt());
diff --git a/tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/op/Concat.cc b/tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/op/Concat.cc
new file mode 100644
index 0000000..0108491
--- /dev/null
+++ b/tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/op/Concat.cc
@@ -0,0 +1,53 @@
+#include "internal/op/Concat.h"
+#include "internal/op/NodeVisitor.h"
+
+#include <cassert>
+
+namespace internal
+{
+namespace tflite
+{
+namespace op
+{
+namespace Concat
+{
+// Visitor entry point: forwards this node to the visit() overload taking Concat::Node
+void Node::accept(NodeVisitor &&v) const { v.visit(*this); }
+
+} // namespace Concat
+} // namespace op
+} // namespace tflite
+} // namespace internal
+
+namespace internal
+{
+namespace tflite
+{
+namespace op
+{
+namespace Concat
+{
+
+Param::Param(uint32_t inputCount, const uint32_t* inputs,
+             uint32_t outputCount, const uint32_t* outputs)
+{
+  // The axis is always the last input; with no inputs 'inputCount - 1' below would wrap around
+  assert(inputCount >= 1 && outputCount == 1);
+  ofm_index = outputs[0];
+
+  // When there are N + 1 inputs, each input should be interpreted as follows:
+  //
+  //  [0, N) -> Input tensors
+  //  N -> Axis
+  axis_index = inputs[inputCount - 1];
+
+  for (uint32_t n = 0; n < inputCount - 1; ++n)
+  {
+    ifm_indexes.emplace_back(inputs[n]);
+  }
+}
+
+} // namespace Concat
+} // namespace op
+} // namespace tflite
+} // namespace internal
diff --git a/tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/op/Concat.h b/tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/op/Concat.h
new file mode 100644
index 0000000..11c87bb
--- /dev/null
+++ b/tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/op/Concat.h
@@ -0,0 +1,56 @@
+#ifndef __INTERNAL_OP_CONCAT_H__
+#define __INTERNAL_OP_CONCAT_H__
+
+#include "internal/op/Node.h"
+
+#include <cstdint>
+#include <vector>
+
+namespace internal
+{
+namespace tflite
+{
+namespace op
+{
+namespace Concat
+{
+
+struct Param
+{
+  int32_t ofm_index; // output operand (outputs[0])
+
+  std::vector<int32_t> ifm_indexes; // input tensor operands (every input except the last)
+  int32_t axis_index;               // axis operand (the last input)
+
+  Param() = default; // NOTE 'Param p;' leaves the int32_t members unset
+  Param(uint32_t inputCount, const uint32_t* inputs,
+        uint32_t outputCount, const uint32_t* outputs);
+};
+
+class Node final : public op::Node // operation node holding the Concat parameters
+{
+public:
+  Node(const Param &param) : _param(param)
+  {
+    // DO NOTHING
+  }
+
+public:
+  virtual ~Node() = default;
+
+public:
+  const Param &param(void) const { return _param; }
+
+public:
+  void accept(NodeVisitor &&) const override; // defined in Concat.cc; calls v.visit(*this)
+
+private:
+  const Param _param; // copied from the constructor argument, immutable afterwards
+};
+
+} // namespace Concat
+} // namespace op
+} // namespace tflite
+} // namespace internal
+
+#endif // __INTERNAL_OP_CONCAT_H__
diff --git a/tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/op/NodeVisitor.h b/tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/op/NodeVisitor.h
index cd2155b..ba7ee3a 100644
--- a/tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/op/NodeVisitor.h
+++ b/tools/nnapi_bindings/bindings/pure_arm_compute/src/internal/op/NodeVisitor.h
@@ -4,6 +4,7 @@
 #include "internal/op/Conv2D.h"
 #include "internal/op/MaxPool2D.h"
 #include "internal/op/AvgPool2D.h"
+#include "internal/op/Concat.h"
 
 namespace internal
 {
@@ -19,6 +20,7 @@ struct NodeVisitor
   virtual void visit(const Conv2D::implicit::Node &) = 0;
   virtual void visit(const MaxPool2D::implicit::Node &) = 0;
   virtual void visit(const AvgPool2D::implicit::Node &) = 0;
+  virtual void visit(const Concat::Node &) = 0;
 };
 
 } // namespace op
diff --git a/tools/nnapi_bindings/bindings/pure_arm_compute/src/model.cc b/tools/nnapi_bindings/bindings/pure_arm_compute/src/model.cc
index 853845e..37fb6b8 100644
--- a/tools/nnapi_bindings/bindings/pure_arm_compute/src/model.cc
+++ b/tools/nnapi_bindings/bindings/pure_arm_compute/src/model.cc
@@ -158,6 +158,18 @@ ANeuralNetworksModel_addOperation(ANeuralNetworksModel* model,
 
       break;
     }
+    case ANEURALNETWORKS_CONCATENATION:
+    {
+      using internal::tflite::op::Concat::Param;
+      using internal::tflite::op::Concat::Node;
+
+      // Add 'operations'
+      auto &operations = model->deref().operations();
+
+      operations.emplace_back<Node>(Param{inputCount, inputs, outputCount, outputs});
+
+      break;
+    }
     default:
       throw std::runtime_error{"Not supported operation"};
   };
-- 
2.7.4