From 052584ab08a5a1a4c1a76ae8fd421caddd62b2b6 Mon Sep 17 00:00:00 2001
From: Yongseop Kim/동작제어Lab(SR)/Engineer/Samsung Electronics
Date: Mon, 5 Nov 2018 10:12:03 +0900
Subject: [PATCH] [neurun] Apply tensor lifetime-info(use-def) to
 Linear/TensorBuilder (#3409)

* [neurun] Apply tensor lifetime-info(use-def) to memory allocation

Applies tensor lifetime info (use-def) to memory allocation by revising
Linear and TensorBuilder.

- Introduce vassignTensors (virtually assign tensors) in Linear, replacing
  markTensors
- Introduce assign (virtually assign memory) / unassign (virtually unassign
  memory) in TensorBuilder, replacing mark

Signed-off-by: Yongseop Kim

* Fix release build error

* Rename functions properly

Linear::vassignTensors -> planTensors
ITensorBuilder::assign -> notifyFirstUse
ITensorBuilder::unassign -> notifyLastUse

* Fix log

* Fix to scan outputs before inputs
---
 .../neurun/src/backend/acl_cl/TensorBuilder.cc     |   9 +-
 runtimes/neurun/src/backend/acl_cl/TensorBuilder.h |   6 +-
 runtimes/neurun/src/backend/cpu/MemoryAllocator.cc |   5 +-
 runtimes/neurun/src/backend/cpu/TensorBuilder.cc   |  58 +++++------
 runtimes/neurun/src/backend/cpu/TensorBuilder.h    |   6 +-
 .../neurun/src/backend/interface/ITensorBuilder.h  |   6 +-
 .../neurun/src/frontend/wrapper/compilation.cc     |   4 +-
 runtimes/neurun/src/linear/Linear.cc               | 108 +++++++++++++++++++--
 runtimes/neurun/src/linear/Linear.h                |   2 +-
 9 files changed, 157 insertions(+), 47 deletions(-)

diff --git a/runtimes/neurun/src/backend/acl_cl/TensorBuilder.cc b/runtimes/neurun/src/backend/acl_cl/TensorBuilder.cc
index 262f4f2..c0cc2c8 100644
--- a/runtimes/neurun/src/backend/acl_cl/TensorBuilder.cc
+++ b/runtimes/neurun/src/backend/acl_cl/TensorBuilder.cc
@@ -34,14 +34,19 @@ TensorBuilder::TensorBuilder()
   // DO NOTHING
 }
 
-void TensorBuilder::mark(const ::neurun::graph::operand::Index &ind,
-                         const ::arm_compute::TensorInfo &info)
+void TensorBuilder::notifyFirstUse(const graph::operand::Index &ind,
+                                   const ::arm_compute::TensorInfo &info)
 {
   assert(_tensors.size() == 0);
 
   _tensor_info_map.insert({ind, info});
 }
 
+void TensorBuilder::notifyLastUse(const graph::operand::Index &)
+{
+  // DO NOTHING
+}
+
 void TensorBuilder::prepare(void)
 {
   assert(_tensors.size() == 0);
diff --git a/runtimes/neurun/src/backend/acl_cl/TensorBuilder.h b/runtimes/neurun/src/backend/acl_cl/TensorBuilder.h
index 78a985a..d57cb5c 100644
--- a/runtimes/neurun/src/backend/acl_cl/TensorBuilder.h
+++ b/runtimes/neurun/src/backend/acl_cl/TensorBuilder.h
@@ -35,8 +35,10 @@ class TensorBuilder : public ITensorBuilder
 public:
   TensorBuilder();
 
-  virtual void mark(const ::neurun::graph::operand::Index &ind,
-                    const ::arm_compute::TensorInfo &info) override;
+  virtual void notifyFirstUse(const graph::operand::Index &,
+                              const ::arm_compute::TensorInfo &) override;
+  virtual void notifyLastUse(const graph::operand::Index &) override;
+
   virtual void prepare(void) override;
   virtual void allocate(void) override;
 
diff --git a/runtimes/neurun/src/backend/cpu/MemoryAllocator.cc b/runtimes/neurun/src/backend/cpu/MemoryAllocator.cc
index 889f94c..b9cc213 100644
--- a/runtimes/neurun/src/backend/cpu/MemoryAllocator.cc
+++ b/runtimes/neurun/src/backend/cpu/MemoryAllocator.cc
@@ -48,9 +48,10 @@ void BumpAllocator::finalize()
 {
   assert(!_base && _pos != 0);
 
-  VERBOSE(BP_ALLOC) << "final position: " << _pos << std::endl;
-
   _base = new uint8_t[_pos];
+
+  VERBOSE(BP_ALLOC) << "final position: " << _pos << std::endl;
+  VERBOSE(BP_ALLOC) << "base pointer: " << static_cast<void *>(_base) << std::endl;
 }
 
 void BumpAllocator::free(const graph::operand::Index &index)
diff --git a/runtimes/neurun/src/backend/cpu/TensorBuilder.cc b/runtimes/neurun/src/backend/cpu/TensorBuilder.cc
index 4bb21b6..c349a67 100644
--- a/runtimes/neurun/src/backend/cpu/TensorBuilder.cc
+++ b/runtimes/neurun/src/backend/cpu/TensorBuilder.cc
@@ -20,6 +20,7 @@
 #include "operand/Object.h"
 #include "MemoryAllocator.h"
+#include "logging.h"
 
 namespace neurun
 {
@@ -34,12 +35,28 @@ TensorBuilder::TensorBuilder() : _mem_alloc(std::make_shared<BumpAllocator>())
   // DO NOTHING
 }
 
-void TensorBuilder::mark(const ::neurun::graph::operand::Index &ind,
-                         const ::arm_compute::TensorInfo &info)
+void TensorBuilder::notifyFirstUse(const graph::operand::Index &ind,
+                                   const ::arm_compute::TensorInfo &info)
 {
-  assert(_tensors.size() == 0);
+  assert(_mem_alloc);
 
   _tensor_info_map.insert({ind, info});
+
+  const auto size = info.total_size();
+  auto mem_blk = _mem_alloc->allocate(ind, size);
+  _tensor_mem_map[ind] = mem_blk;
+
+  VERBOSE(CPU_TENSORBUILDER) << "ASSIGN(#" << ind.value() << "): mem_blk[" << mem_blk.offset << ", "
+                             << mem_blk.size << "]" << std::endl;
+}
+
+void TensorBuilder::notifyLastUse(const graph::operand::Index &ind)
+{
+  assert(_mem_alloc);
+
+  _mem_alloc->free(ind);
+
+  VERBOSE(CPU_TENSORBUILDER) << "UNASSIGN(#" << ind.value() << ")" << std::endl;
 }
 
 void TensorBuilder::prepare(void)
@@ -47,26 +64,6 @@ void TensorBuilder::prepare(void)
   assert(_tensors.size() == 0);
   assert(_mem_alloc);
 
-  for (auto &entry : _tensor_info_map)
-  {
-    auto ind = entry.first;
-    const auto &info = entry.second;
-    auto tensor = std::make_shared<operand::Tensor>(info);
-    _tensors[ind] = tensor;
-    // If we do not make tensor here currently, stages would cause segment fault
-
-    const auto size = info.total_size(); // NOTE This size may not be accurate
-    auto mem_blk = _mem_alloc->allocate(ind, size);
-    _tensor_mem_map[ind] = mem_blk;
-  }
-  assert(_tensor_info_map.size() == _tensor_mem_map.size());
-
-  // TODO below code can be moved in TensorBuild::allocate()
-  // if StageGerator was modified like
-  // from
-  //   fn->configure(ifm_alloc->buffer(), param.ifm_shape, ker_alloc->buffer(), param.ker_shape,
-  // to
-  //   fn->configure(ifm_alloc, param.ifm_shape, ker_alloc, param.ker_shape,
   _mem_alloc->finalize();
   assert(_mem_alloc->base());
 
@@ -74,15 +71,22 @@
   {
     auto ind = entry.first;
     auto mem_blk = entry.second;
-    auto &tensor = _tensors[ind];
-    tensor->setBuffer(_mem_alloc->base() + mem_blk.offset);
+    const auto &info = _tensor_info_map[ind];
+
+    uint8_t *buffer = _mem_alloc->base() + mem_blk.offset;
+    auto tensor = std::make_shared<operand::Tensor>(info);
+    tensor->setBuffer(buffer);
+    _tensors[ind] = tensor;
+
+    VERBOSE(CPU_TENSORBUILDER) << "TENSOR(#" << ind.value() << "): " << static_cast<void *>(buffer)
+                               << std::endl;
+
+    // If we do not create the tensor here, stages would cause a segmentation fault
   }
 }
 
 void TensorBuilder::allocate(void)
 {
-  assert(_tensor_info_map.size() == _tensors.size());
-
   // NOTE For now nothing to do. Allocation is done in prepare stage, which is wrong
 }
diff --git a/runtimes/neurun/src/backend/cpu/TensorBuilder.h b/runtimes/neurun/src/backend/cpu/TensorBuilder.h
index 0c126d2..768f6ab 100644
--- a/runtimes/neurun/src/backend/cpu/TensorBuilder.h
+++ b/runtimes/neurun/src/backend/cpu/TensorBuilder.h
@@ -36,8 +36,10 @@ class TensorBuilder : public ITensorBuilder
 public:
   TensorBuilder();
 
-  virtual void mark(const ::neurun::graph::operand::Index &ind,
-                    const ::arm_compute::TensorInfo &info) override;
+  virtual void notifyFirstUse(const graph::operand::Index &,
+                              const ::arm_compute::TensorInfo &) override;
+  virtual void notifyLastUse(const graph::operand::Index &) override;
+
   virtual void prepare(void) override;
   virtual void allocate(void) override;
 
diff --git a/runtimes/neurun/src/backend/interface/ITensorBuilder.h b/runtimes/neurun/src/backend/interface/ITensorBuilder.h
index 9d0a6bb..d968c47 100644
--- a/runtimes/neurun/src/backend/interface/ITensorBuilder.h
+++ b/runtimes/neurun/src/backend/interface/ITensorBuilder.h
@@ -33,8 +33,10 @@ struct ITensorBuilder
   using IterateFunction = std::function<void(const graph::operand::Index &)>;
 
   virtual ~ITensorBuilder(void) = default;
-  virtual void mark(const ::neurun::graph::operand::Index &ind,
-                    const ::arm_compute::TensorInfo &info) = 0;
+
+  virtual void notifyFirstUse(const graph::operand::Index &, const ::arm_compute::TensorInfo &) = 0;
+  virtual void notifyLastUse(const graph::operand::Index &) = 0;
+
   // TODO Add an interface for adding subsumption info
   virtual void prepare(void) = 0;
   virtual void allocate(void) = 0;
diff --git a/runtimes/neurun/src/frontend/wrapper/compilation.cc b/runtimes/neurun/src/frontend/wrapper/compilation.cc
index 4258085..5f448d0 100644
--- a/runtimes/neurun/src/frontend/wrapper/compilation.cc
+++ b/runtimes/neurun/src/frontend/wrapper/compilation.cc
@@ -65,10 +65,10 @@ int ANeuralNetworksCompilation::finish()
 
   neurun::codegen::PlanBuilder plan_builder{plan};
 
-  auto tensor_builders = linear->markTensors();
-
   linear->accept(neurun::codegen::Planner{operands, plan_builder});
 
+  auto tensor_builders = linear->planTensors();
+
   // TODO Add optimization passes
   plan_builder.finalize(tensor_builders);
 
diff --git a/runtimes/neurun/src/linear/Linear.cc b/runtimes/neurun/src/linear/Linear.cc
index 41921a5..162477d 100644
--- a/runtimes/neurun/src/linear/Linear.cc
+++ b/runtimes/neurun/src/linear/Linear.cc
@@ -22,6 +22,8 @@
 #include "backend/interface/IStageGenerator.h"
 #include "internal/Convert.h"
 
+#include "logging.h"
+
 namespace neurun
 {
 namespace linear
@@ -50,23 +52,115 @@ void Linear::accept(graph::operation::NodeVisitor &&visitor) const
   }
 }
 
-backend::TensorBuilderSet Linear::markTensors() const
+backend::TensorBuilderSet Linear::planTensors()
 {
+  using ITensorBuilderPtr = std::shared_ptr<backend::ITensorBuilder>;
+  using FnOnTensorBuilder =
+      std::function<void(const graph::operand::Index &, ITensorBuilderPtr)>;
+
+  const auto &operands = _graph.operands();
+  auto iterTensorBuilders = [&operands](const graph::operand::Index &ind, FnOnTensorBuilder fn) {
+    const auto &obj = operands.at(ind);
+    for (auto backend : obj.lower_info()->def_backends())
+    {
+      auto tensor_builder = backend->tensor_builder();
+      fn(ind, tensor_builder);
+    }
+  };
+
   backend::TensorBuilderSet tensor_builders;
+
+  std::unordered_map<graph::operand::Index, uint32_t> uses_map;
+  std::vector<graph::operand::Index> constants;
+
   _graph.operands().iterate(
       [&](const graph::operand::Index &ind, const graph::operand::Object &obj) {
-        for (auto backend : obj.lower_info()->def_backends())
-        {
-          auto tensor_builder = backend->tensor_builder();
-          const auto info = ::internal::asTensorInfo(obj.shape(), obj.typeInfo());
+        uses_map[ind] = obj.getUses().size();
 
-          tensor_builder->mark(ind, info);
+        // If a tensor is a constant, increase its use count.
+        // This keeps the tensor from being deallocated.
+        if (obj.getUsage() == graph::operand::OperandUsage::CONSTANT)
+        {
+          constants.push_back(ind);
+          uses_map[ind]++;
+        }
 
+        // Prepare tensor builders to be returned
+        iterTensorBuilders(ind, [&tensor_builders](const graph::operand::Index &,
+                                                   ITensorBuilderPtr tensor_builder) {
           tensor_builders.insert(tensor_builder);
-        }
+        });
       });
 
+  // If a tensor is a model output, increase its use count as well.
+  // The aim is the same as above: keep it from being deallocated.
+  for (const auto &ind : _graph.getOutputs())
+  {
+    uses_map[ind]++;
+  }
+
+  // Allocate constant operands first
+  VERBOSE(LINEAR) << "TENSORS as CONSTANT" << std::endl;
+  for (const auto &ind : constants)
+  {
+    const auto &obj = operands.at(ind);
+    const auto info = ::internal::asTensorInfo(obj.shape(), obj.typeInfo());
+    iterTensorBuilders(ind,
+                       [&info](const graph::operand::Index &ind, ITensorBuilderPtr tensor_builder) {
+                         tensor_builder->notifyFirstUse(ind, info);
+                       });
+  }
+
+  // Allocate the model's inputs
+  VERBOSE(LINEAR) << "TENSORS as MODEL INPUT" << std::endl;
+  for (const auto &ind : _graph.getInputs())
+  {
+    const auto &obj = operands.at(ind);
+    const auto info = ::internal::asTensorInfo(obj.shape(), obj.typeInfo());
+    iterTensorBuilders(ind,
+                       [&info](const graph::operand::Index &ind, ITensorBuilderPtr tensor_builder) {
+                         tensor_builder->notifyFirstUse(ind, info);
+                       });
+  }
+
+  // At each operation,
+  // 1. Scan the DEF of its outputs: if an operand is defined here, claim its memory
+  // 2. Scan the USE of its inputs: decrease each use count and release the memory when it reaches 0
+  VERBOSE(LINEAR) << "TENSORS" << std::endl;
+  for (const auto op : _operations)
+  {
+    for (const auto &ind : op->getOutputs())
+    {
+      const auto &obj = operands.at(ind);
+      if (obj.getDef().size())
+      {
+        const auto info = ::internal::asTensorInfo(obj.shape(), obj.typeInfo());
+        iterTensorBuilders(
+            ind, [&info](const graph::operand::Index &ind, ITensorBuilderPtr tensor_builder) {
+              tensor_builder->notifyFirstUse(ind, info);
+            });
+      }
+    }
+
+    for (const auto &ind : op->getInputs())
+    {
+      uses_map[ind]--;
+      if (uses_map[ind] == 0)
+      {
+        iterTensorBuilders(ind,
+                           [](const graph::operand::Index &ind, ITensorBuilderPtr tensor_builder) {
+                             tensor_builder->notifyLastUse(ind);
+                           });
+      }
+    }
+  }
+
+#ifndef NDEBUG
+  // At this point, model outputs must not have been deallocated
+  for (const auto &ind : _graph.getOutputs())
+    assert(uses_map[ind] > 0);
+#endif
+
   return tensor_builders;
 }
diff --git a/runtimes/neurun/src/linear/Linear.h b/runtimes/neurun/src/linear/Linear.h
index 46815e4..16ef785 100644
--- a/runtimes/neurun/src/linear/Linear.h
+++ b/runtimes/neurun/src/linear/Linear.h
@@ -58,7 +58,7 @@ public:
   void accept(graph::operation::NodeVisitor &&visitor) const;
 
   // TODO Should not return TensorBuilderSet
-  backend::TensorBuilderSet markTensors() const;
+  backend::TensorBuilderSet planTensors();
 
 private:
   const graph::Graph &_graph;
-- 
2.7.4
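
To see the lifetime-planning idea behind Linear::planTensors in isolation, the standalone sketch below walks a linearized operation list, counts operand uses, gives model outputs one extra use so they are never released, and announces each first definition and last use. It is illustrative only: Op, plan_lifetimes, and the plain integer operand indices are simplified stand-ins rather than the actual neurun types, and the up-front claiming of constants and model inputs done by the patch is omitted here.

#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

// Simplified stand-in for a graph operation: just lists of operand indices.
struct Op
{
  std::vector<uint32_t> inputs;
  std::vector<uint32_t> outputs;
};

// Walk a linearized operation list and report each operand's first definition
// (memory may be claimed) and last use (memory may be released), mirroring the
// use-count bookkeeping that the patch introduces.
void plan_lifetimes(const std::vector<Op> &ops, const std::vector<uint32_t> &model_outputs,
                    std::unordered_map<uint32_t, uint32_t> uses /* operand index -> use count */)
{
  // Model outputs get one extra use so their count never reaches zero.
  for (auto ind : model_outputs)
    uses[ind]++;

  for (const auto &op : ops)
  {
    // 1. Outputs defined by this operation are claimed here (first use).
    for (auto ind : op.outputs)
      std::cout << "first use of operand #" << ind << " -> claim memory\n";

    // 2. Inputs whose use count drops to zero are released here (last use).
    for (auto ind : op.inputs)
    {
      if (--uses[ind] == 0)
        std::cout << "last use of operand #" << ind << " -> release memory\n";
    }
  }
}

int main()
{
  // Tiny chain: operand 0 -> (op A) -> operand 1 -> (op B) -> operand 2
  std::vector<Op> ops = {{{0}, {1}}, {{1}, {2}}};
  plan_lifetimes(ops, /*model_outputs=*/{2}, /*uses=*/{{0, 1}, {1, 1}, {2, 0}});
  return 0;
}

Running this prints a claim for operand 1, a release for operand 0, a claim for operand 2, and a release for operand 1; operand 2 is never released because its model-output use keeps the count above zero, which is exactly what the NDEBUG check in planTensors asserts.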
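On the CPU backend these notifications feed the BumpAllocator: notifyFirstUse reserves a block, notifyLastUse forwards to free(), and prepare() calls finalize() once to create the single backing buffer and hand out base() + offset pointers. The patch does not show BumpAllocator::free or allocate, so the toy below is a sketch under the assumption that a pure bump allocator only advances a high-water mark and cannot reclaim individual blocks; ToyBumpAllocator, Block, and the plain integer index type are illustrative names, not the real interfaces.

#include <cassert>
#include <cstdint>
#include <unordered_map>

// Offset/size pair handed back to the tensor builder (mem_blk in the patch).
struct Block
{
  uint32_t offset;
  uint32_t size;
};

// Toy bump allocator: reservations only move a high-water mark, and the real
// buffer is created once in finalize(), as BumpAllocator::finalize() does above.
class ToyBumpAllocator
{
public:
  Block allocate(uint32_t index, uint32_t size)
  {
    Block blk{_pos, size};
    _mem_map[index] = blk; // remember the reservation per operand index
    _pos += size;
    return blk;
  }

  // A bump allocator cannot reclaim individual blocks, so a last-use
  // notification can safely be ignored here.
  void free(uint32_t /*index*/) {}

  void finalize()
  {
    assert(!_base && _pos != 0);
    _base = new uint8_t[_pos]; // one backing buffer for every reservation
  }

  uint8_t *base() const { return _base; }

  ~ToyBumpAllocator() { delete[] _base; }

private:
  uint8_t *_base = nullptr;
  uint32_t _pos = 0;
  std::unordered_map<uint32_t, Block> _mem_map;
};

int main()
{
  ToyBumpAllocator alloc;
  auto blk_a = alloc.allocate(/*index=*/0, /*size=*/64);  // like notifyFirstUse(#0)
  auto blk_b = alloc.allocate(/*index=*/1, /*size=*/128); // like notifyFirstUse(#1)
  alloc.free(0);                                          // like notifyLastUse(#0), ignored
  alloc.finalize();                                       // done once in prepare()
  uint8_t *buf_a = alloc.base() + blk_a.offset; // buffers handed to the tensors
  uint8_t *buf_b = alloc.base() + blk_b.offset;
  (void)buf_a;
  (void)buf_b;
  return 0;
}

With this allocator the use-def information changes nothing yet, since freed blocks are never reused; the value of the patch is that a smarter allocator can later be dropped in behind the same notifyFirstUse/notifyLastUse protocol to overlap non-conflicting lifetimes.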