From 052584ab08a5a1a4c1a76ae8fd421caddd62b2b6 Mon Sep 17 00:00:00 2001
From: Yongseop Kim/동작제어Lab(SR)/Engineer/Samsung Electronics
Date: Mon, 5 Nov 2018 10:12:03 +0900
Subject: [PATCH] [neurun] Apply tensor lifetime-info(use-def) to
 Linear/TensorBuilder (#3409)

* [neurun] Apply tensor lifetime-info(use-def) to memory allocation

Applies tensor lifetime info (use-def) to memory allocation by revising
Linear and TensorBuilder.

- Introduce vassignTensors (virtually assign tensors) in Linear, replacing
  markTensors
- Introduce assign (virtually assign memory) / unassign (virtually unassign
  memory) in TensorBuilder, replacing mark

Signed-off-by: Yongseop Kim

* Fix release build error

* Rename functions properly

Linear::vassignTensors -> planTensors
ITensorBuilder::assign -> notifyFirstUse
ITensorBuilder::unassign -> notifyLastUse

* Fix log

* Fix to scan outputs before inputs
---
 .../neurun/src/backend/acl_cl/TensorBuilder.cc     |   9 +-
 runtimes/neurun/src/backend/acl_cl/TensorBuilder.h |   6 +-
 runtimes/neurun/src/backend/cpu/MemoryAllocator.cc |   5 +-
 runtimes/neurun/src/backend/cpu/TensorBuilder.cc   |  58 +++++------
 runtimes/neurun/src/backend/cpu/TensorBuilder.h    |   6 +-
 .../neurun/src/backend/interface/ITensorBuilder.h  |   6 +-
 .../neurun/src/frontend/wrapper/compilation.cc     |   4 +-
 runtimes/neurun/src/linear/Linear.cc               | 108 +++++++++++++++++++--
 runtimes/neurun/src/linear/Linear.h                |   2 +-
 9 files changed, 157 insertions(+), 47 deletions(-)

diff --git a/runtimes/neurun/src/backend/acl_cl/TensorBuilder.cc b/runtimes/neurun/src/backend/acl_cl/TensorBuilder.cc
index 262f4f2..c0cc2c8 100644
--- a/runtimes/neurun/src/backend/acl_cl/TensorBuilder.cc
+++ b/runtimes/neurun/src/backend/acl_cl/TensorBuilder.cc
@@ -34,14 +34,19 @@ TensorBuilder::TensorBuilder()
   // DO NOTHING
 }
 
-void TensorBuilder::mark(const ::neurun::graph::operand::Index &ind,
-                         const ::arm_compute::TensorInfo &info)
+void TensorBuilder::notifyFirstUse(const graph::operand::Index &ind,
+                                   const ::arm_compute::TensorInfo &info)
 {
   assert(_tensors.size() == 0);
 
   _tensor_info_map.insert({ind, info});
 }
 
+void TensorBuilder::notifyLastUse(const graph::operand::Index &)
+{
+  // DO NOTHING
+}
+
 void TensorBuilder::prepare(void)
 {
   assert(_tensors.size() == 0);
diff --git a/runtimes/neurun/src/backend/acl_cl/TensorBuilder.h b/runtimes/neurun/src/backend/acl_cl/TensorBuilder.h
index 78a985a..d57cb5c 100644
--- a/runtimes/neurun/src/backend/acl_cl/TensorBuilder.h
+++ b/runtimes/neurun/src/backend/acl_cl/TensorBuilder.h
@@ -35,8 +35,10 @@ class TensorBuilder : public ITensorBuilder
 public:
   TensorBuilder();
 
-  virtual void mark(const ::neurun::graph::operand::Index &ind,
-                    const ::arm_compute::TensorInfo &info) override;
+  virtual void notifyFirstUse(const graph::operand::Index &,
+                              const ::arm_compute::TensorInfo &) override;
+  virtual void notifyLastUse(const graph::operand::Index &) override;
+
   virtual void prepare(void) override;
   virtual void allocate(void) override;
 
diff --git a/runtimes/neurun/src/backend/cpu/MemoryAllocator.cc b/runtimes/neurun/src/backend/cpu/MemoryAllocator.cc
index 889f94c..b9cc213 100644
--- a/runtimes/neurun/src/backend/cpu/MemoryAllocator.cc
+++ b/runtimes/neurun/src/backend/cpu/MemoryAllocator.cc
@@ -48,9 +48,10 @@ void BumpAllocator::finalize()
 {
   assert(!_base && _pos != 0);
 
-  VERBOSE(BP_ALLOC) << "final position: " << _pos << std::endl;
-
   _base = new uint8_t[_pos];
+
+  VERBOSE(BP_ALLOC) << "final position: " << _pos << std::endl;
+  VERBOSE(BP_ALLOC) << "base pointer: " << static_cast<void *>(_base) << std::endl;
 }
 
 void BumpAllocator::free(const graph::operand::Index &index)
diff --git a/runtimes/neurun/src/backend/cpu/TensorBuilder.cc b/runtimes/neurun/src/backend/cpu/TensorBuilder.cc
index 4bb21b6..c349a67 100644
--- a/runtimes/neurun/src/backend/cpu/TensorBuilder.cc
+++ b/runtimes/neurun/src/backend/cpu/TensorBuilder.cc
@@ -20,6 +20,7 @@
 #include "operand/Object.h"
 #include "MemoryAllocator.h"
+#include "logging.h"
 
 namespace neurun
 {
@@ -34,12 +35,28 @@ TensorBuilder::TensorBuilder() : _mem_alloc(std::make_shared<BumpAllocator>())
   // DO NOTHING
 }
 
-void TensorBuilder::mark(const ::neurun::graph::operand::Index &ind,
-                         const ::arm_compute::TensorInfo &info)
+void TensorBuilder::notifyFirstUse(const graph::operand::Index &ind,
+                                   const ::arm_compute::TensorInfo &info)
 {
-  assert(_tensors.size() == 0);
+  assert(_mem_alloc);
 
   _tensor_info_map.insert({ind, info});
+
+  const auto size = info.total_size();
+  auto mem_blk = _mem_alloc->allocate(ind, size);
+  _tensor_mem_map[ind] = mem_blk;
+
+  VERBOSE(CPU_TENSORBUILDER) << "ASSIGN(#" << ind.value() << "): mem_blk[" << mem_blk.offset << ", "
+                             << mem_blk.size << "]" << std::endl;
+}
+
+void TensorBuilder::notifyLastUse(const graph::operand::Index &ind)
+{
+  assert(_mem_alloc);
+
+  _mem_alloc->free(ind);
+
+  VERBOSE(CPU_TENSORBUILDER) << "UNASSIGN(#" << ind.value() << ")" << std::endl;
 }
 
 void TensorBuilder::prepare(void)
@@ -47,26 +64,6 @@ void TensorBuilder::prepare(void)
   assert(_tensors.size() == 0);
   assert(_mem_alloc);
 
-  for (auto &entry : _tensor_info_map)
-  {
-    auto ind = entry.first;
-    const auto &info = entry.second;
-    auto tensor = std::make_shared<operand::Tensor>(info);
-    _tensors[ind] = tensor;
-    // If we do not make tensor here currently, stages would cause segment fault
-
-    const auto size = info.total_size(); // NOTE This size may not be accurate
-    auto mem_blk = _mem_alloc->allocate(ind, size);
-    _tensor_mem_map[ind] = mem_blk;
-  }
-  assert(_tensor_info_map.size() == _tensor_mem_map.size());
-
-  // TODO below code can be moved in TensorBuild::allocate()
-  // if StageGerator was modified like
-  // from
-  //   fn->configure(ifm_alloc->buffer(), param.ifm_shape, ker_alloc->buffer(), param.ker_shape,
-  // to
-  //   fn->configure(ifm_alloc, param.ifm_shape, ker_alloc, param.ker_shape,
   _mem_alloc->finalize();
   assert(_mem_alloc->base());
 
@@ -74,15 +71,22 @@
   {
     auto ind = entry.first;
     auto mem_blk = entry.second;
-    auto &tensor = _tensors[ind];
-    tensor->setBuffer(_mem_alloc->base() + mem_blk.offset);
+    const auto &info = _tensor_info_map[ind];
+
+    uint8_t *buffer = _mem_alloc->base() + mem_blk.offset;
+    auto tensor = std::make_shared<operand::Tensor>(info);
+    tensor->setBuffer(buffer);
+    _tensors[ind] = tensor;
+
+    VERBOSE(CPU_TENSORBUILDER) << "TENSOR(#" << ind.value() << "): " << static_cast<void *>(buffer)
+                               << std::endl;
+
+    // If we do not create the tensor here, stages would cause a segmentation fault
   }
 }
 
 void TensorBuilder::allocate(void)
 {
-  assert(_tensor_info_map.size() == _tensors.size());
-
   // NOTE For now nothing to do. Allocation is done in prepare stage, which is wrong
 }
diff --git a/runtimes/neurun/src/backend/cpu/TensorBuilder.h b/runtimes/neurun/src/backend/cpu/TensorBuilder.h
index 0c126d2..768f6ab 100644
--- a/runtimes/neurun/src/backend/cpu/TensorBuilder.h
+++ b/runtimes/neurun/src/backend/cpu/TensorBuilder.h
@@ -36,8 +36,10 @@ class TensorBuilder : public ITensorBuilder
 public:
   TensorBuilder();
 
-  virtual void mark(const ::neurun::graph::operand::Index &ind,
-                    const ::arm_compute::TensorInfo &info) override;
+  virtual void notifyFirstUse(const graph::operand::Index &,
+                              const ::arm_compute::TensorInfo &) override;
+  virtual void notifyLastUse(const graph::operand::Index &) override;
+
   virtual void prepare(void) override;
   virtual void allocate(void) override;
 
diff --git a/runtimes/neurun/src/backend/interface/ITensorBuilder.h b/runtimes/neurun/src/backend/interface/ITensorBuilder.h
index 9d0a6bb..d968c47 100644
--- a/runtimes/neurun/src/backend/interface/ITensorBuilder.h
+++ b/runtimes/neurun/src/backend/interface/ITensorBuilder.h
@@ -33,8 +33,10 @@ struct ITensorBuilder
   using IterateFunction = std::function<void(const graph::operand::Index &)>;
 
   virtual ~ITensorBuilder(void) = default;
-  virtual void mark(const ::neurun::graph::operand::Index &ind,
-                    const ::arm_compute::TensorInfo &info) = 0;
+
+  virtual void notifyFirstUse(const graph::operand::Index &, const ::arm_compute::TensorInfo &) = 0;
+  virtual void notifyLastUse(const graph::operand::Index &) = 0;
+
   // TODO Add an interface for adding subsumption info
   virtual void prepare(void) = 0;
   virtual void allocate(void) = 0;
diff --git a/runtimes/neurun/src/frontend/wrapper/compilation.cc b/runtimes/neurun/src/frontend/wrapper/compilation.cc
index 4258085..5f448d0 100644
--- a/runtimes/neurun/src/frontend/wrapper/compilation.cc
+++ b/runtimes/neurun/src/frontend/wrapper/compilation.cc
@@ -65,10 +65,10 @@ int ANeuralNetworksCompilation::finish()
 
   neurun::codegen::PlanBuilder plan_builder{plan};
 
-  auto tensor_builders = linear->markTensors();
-
   linear->accept(neurun::codegen::Planner{operands, plan_builder});
 
+  auto tensor_builders = linear->planTensors();
+
   // TODO Add optimization passes
   plan_builder.finalize(tensor_builders);
 
diff --git a/runtimes/neurun/src/linear/Linear.cc b/runtimes/neurun/src/linear/Linear.cc
index 41921a5..162477d 100644
--- a/runtimes/neurun/src/linear/Linear.cc
+++ b/runtimes/neurun/src/linear/Linear.cc
@@ -22,6 +22,8 @@
 #include "backend/interface/IStageGenerator.h"
 #include "internal/Convert.h"
 
+#include "logging.h"
+
 namespace neurun
 {
 namespace linear
@@ -50,23 +52,115 @@ void Linear::accept(graph::operation::NodeVisitor &&visitor) const
   }
 }
 
-backend::TensorBuilderSet Linear::markTensors() const
+backend::TensorBuilderSet Linear::planTensors()
 {
+  using ITensorBuilderPtr = std::shared_ptr<backend::ITensorBuilder>;
+  using FnOnTensorBuilder =
+      std::function<void(const graph::operand::Index &, ITensorBuilderPtr)>;
+
+  const auto &operands = _graph.operands();
+  auto iterTensorBuilders = [&operands](const graph::operand::Index &ind, FnOnTensorBuilder fn) {
+    const auto &obj = operands.at(ind);
+    for (auto backend : obj.lower_info()->def_backends())
+    {
+      auto tensor_builder = backend->tensor_builder();
+      fn(ind, tensor_builder);
+    }
+  };
+
   backend::TensorBuilderSet tensor_builders;
+
+  std::unordered_map<graph::operand::Index, uint32_t> uses_map;
+  std::vector<graph::operand::Index> constants;
+
   _graph.operands().iterate(
       [&](const graph::operand::Index &ind, const graph::operand::Object &obj) {
-        for (auto backend : obj.lower_info()->def_backends())
-        {
-          auto tensor_builder = backend->tensor_builder();
-          const auto info = ::internal::asTensorInfo(obj.shape(), obj.typeInfo());
+        uses_map[ind] = obj.getUses().size();
 
-          tensor_builder->mark(ind, info);
+        // If a tensor is a constant, increase its use count.
+        // This keeps the tensor from being deallocated.
+        if (obj.getUsage() == graph::operand::OperandUsage::CONSTANT)
+        {
+          constants.push_back(ind);
+          uses_map[ind]++;
+        }
 
+        // Prepare tensor builders to be returned
+        iterTensorBuilders(ind, [&tensor_builders](const graph::operand::Index &,
+                                                   ITensorBuilderPtr tensor_builder) {
           tensor_builders.insert(tensor_builder);
-        }
+        });
       });
 
+  // If a tensor is a model output, increase its use count as well.
+  // The aim is the same as above: keep it from being deallocated.
+  for (const auto &ind : _graph.getOutputs())
+  {
+    uses_map[ind]++;
+  }
+
+  // Allocate constant operands first
+  VERBOSE(LINEAR) << "TENSORS as CONSTANT" << std::endl;
+  for (const auto &ind : constants)
+  {
+    const auto &obj = operands.at(ind);
+    const auto info = ::internal::asTensorInfo(obj.shape(), obj.typeInfo());
+    iterTensorBuilders(ind,
+                       [&info](const graph::operand::Index &ind, ITensorBuilderPtr tensor_builder) {
+                         tensor_builder->notifyFirstUse(ind, info);
+                       });
+  }
+
+  // Allocate the model's inputs
+  VERBOSE(LINEAR) << "TENSORS as MODEL INPUT" << std::endl;
+  for (const auto &ind : _graph.getInputs())
+  {
+    const auto &obj = operands.at(ind);
+    const auto info = ::internal::asTensorInfo(obj.shape(), obj.typeInfo());
+    iterTensorBuilders(ind,
+                       [&info](const graph::operand::Index &ind, ITensorBuilderPtr tensor_builder) {
+                         tensor_builder->notifyFirstUse(ind, info);
+                       });
+  }
+
+  // At each operation,
+  // 1. Scan the DEF of its outputs: if an operand is defined here, claim its memory
+  // 2. Scan the USE of its inputs: decrease each use count and release the memory when it reaches 0
+  VERBOSE(LINEAR) << "TENSORS" << std::endl;
+  for (const auto op : _operations)
+  {
+    for (const auto &ind : op->getOutputs())
+    {
+      const auto &obj = operands.at(ind);
+      if (obj.getDef().size())
+      {
+        const auto info = ::internal::asTensorInfo(obj.shape(), obj.typeInfo());
+        iterTensorBuilders(
+            ind, [&info](const graph::operand::Index &ind, ITensorBuilderPtr tensor_builder) {
+              tensor_builder->notifyFirstUse(ind, info);
+            });
+      }
+    }
+
+    for (const auto &ind : op->getInputs())
+    {
+      uses_map[ind]--;
+      if (uses_map[ind] == 0)
+      {
+        iterTensorBuilders(ind,
+                           [](const graph::operand::Index &ind, ITensorBuilderPtr tensor_builder) {
+                             tensor_builder->notifyLastUse(ind);
+                           });
+      }
+    }
+  }
+
+#ifndef NDEBUG
+  // At this point, model outputs must not have been deallocated
+  for (const auto &ind : _graph.getOutputs())
+    assert(uses_map[ind] > 0);
+#endif
+
   return tensor_builders;
 }
diff --git a/runtimes/neurun/src/linear/Linear.h b/runtimes/neurun/src/linear/Linear.h
index 46815e4..16ef785 100644
--- a/runtimes/neurun/src/linear/Linear.h
+++ b/runtimes/neurun/src/linear/Linear.h
@@ -58,7 +58,7 @@ public:
   void accept(graph::operation::NodeVisitor &&visitor) const;
 
   // TODO Should not return TensorBuilderSet
-  backend::TensorBuilderSet markTensors() const;
+  backend::TensorBuilderSet planTensors();
 
 private:
   const graph::Graph &_graph;
-- 
2.7.4
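
To see the lifetime-planning idea behind Linear::planTensors in isolation, the standalone sketch below walks a linearized operation list, counts operand uses, gives model outputs one extra use so they are never released, and announces each first definition and last use. It is illustrative only: Op, plan_lifetimes, and the plain integer operand indices are simplified stand-ins rather than the actual neurun types, and the up-front claiming of constants and model inputs done by the patch is omitted here.

#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

// Simplified stand-in for a graph operation: just lists of operand indices.
struct Op
{
  std::vector<uint32_t> inputs;
  std::vector<uint32_t> outputs;
};

// Walk a linearized operation list and report each operand's first definition
// (memory may be claimed) and last use (memory may be released), mirroring the
// use-count bookkeeping that the patch introduces.
void plan_lifetimes(const std::vector<Op> &ops, const std::vector<uint32_t> &model_outputs,
                    std::unordered_map<uint32_t, uint32_t> uses /* operand index -> use count */)
{
  // Model outputs get one extra use so their count never reaches zero.
  for (auto ind : model_outputs)
    uses[ind]++;

  for (const auto &op : ops)
  {
    // 1. Outputs defined by this operation are claimed here (first use).
    for (auto ind : op.outputs)
      std::cout << "first use of operand #" << ind << " -> claim memory\n";

    // 2. Inputs whose use count drops to zero are released here (last use).
    for (auto ind : op.inputs)
    {
      if (--uses[ind] == 0)
        std::cout << "last use of operand #" << ind << " -> release memory\n";
    }
  }
}

int main()
{
  // Tiny chain: operand 0 -> (op A) -> operand 1 -> (op B) -> operand 2
  std::vector<Op> ops = {{{0}, {1}}, {{1}, {2}}};
  plan_lifetimes(ops, /*model_outputs=*/{2}, /*uses=*/{{0, 1}, {1, 1}, {2, 0}});
  return 0;
}

Running this prints a claim for operand 1, a release for operand 0, a claim for operand 2, and a release for operand 1; operand 2 is never released because its model-output use keeps the count above zero, which is exactly what the NDEBUG check in planTensors asserts.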
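On the CPU backend these notifications feed the BumpAllocator: notifyFirstUse reserves a block, notifyLastUse forwards to free(), and prepare() calls finalize() once to create the single backing buffer and hand out base() + offset pointers. The patch does not show BumpAllocator::free or allocate, so the toy below is a sketch under the assumption that a pure bump allocator only advances a high-water mark and cannot reclaim individual blocks; ToyBumpAllocator, Block, and the plain integer index type are illustrative names, not the real interfaces.

#include <cassert>
#include <cstdint>
#include <unordered_map>

// Offset/size pair handed back to the tensor builder (mem_blk in the patch).
struct Block
{
  uint32_t offset;
  uint32_t size;
};

// Toy bump allocator: reservations only move a high-water mark, and the real
// buffer is created once in finalize(), as BumpAllocator::finalize() does above.
class ToyBumpAllocator
{
public:
  Block allocate(uint32_t index, uint32_t size)
  {
    Block blk{_pos, size};
    _mem_map[index] = blk; // remember the reservation per operand index
    _pos += size;
    return blk;
  }

  // A bump allocator cannot reclaim individual blocks, so a last-use
  // notification can safely be ignored here.
  void free(uint32_t /*index*/) {}

  void finalize()
  {
    assert(!_base && _pos != 0);
    _base = new uint8_t[_pos]; // one backing buffer for every reservation
  }

  uint8_t *base() const { return _base; }

  ~ToyBumpAllocator() { delete[] _base; }

private:
  uint8_t *_base = nullptr;
  uint32_t _pos = 0;
  std::unordered_map<uint32_t, Block> _mem_map;
};

int main()
{
  ToyBumpAllocator alloc;
  auto blk_a = alloc.allocate(/*index=*/0, /*size=*/64);  // like notifyFirstUse(#0)
  auto blk_b = alloc.allocate(/*index=*/1, /*size=*/128); // like notifyFirstUse(#1)
  alloc.free(0);                                          // like notifyLastUse(#0), ignored
  alloc.finalize();                                       // done once in prepare()
  uint8_t *buf_a = alloc.base() + blk_a.offset; // buffers handed to the tensors
  uint8_t *buf_b = alloc.base() + blk_b.offset;
  (void)buf_a;
  (void)buf_b;
  return 0;
}

With this allocator the use-def information changes nothing yet, since freed blocks are never reused; the value of the patch is that a smarter allocator can later be dropped in behind the same notifyFirstUse/notifyLastUse protocol to overlap non-conflicting lifetimes.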