From 0e11454d19e106ba6d5819c1147ca540cbce2943 Mon Sep 17 00:00:00 2001
From: Don Jang
Date: Wed, 15 Sep 2021 12:50:22 -0700
Subject: [PATCH] [Static Runtime] Move MemoryPlanner out into memory_planner.cpp (#65011)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/65011

This change moves `MemoryPlanner` out of impl.cpp into memory_planner.cpp.
`MemoryPlanner` performs an independent sub-task: it statically analyzes the
graph, creates a memory plan, and allocates/deallocates the managed Tensors.

Splitting it out will reduce merge conflicts as I work on `MemoryPlanner` more
actively for output Tensor support.

Test Plan: N/A

Reviewed By: mikeiovine

Differential Revision: D30883290

fbshipit-source-id: a37570f8d9430224a6987d2190bcf81cf875043d
---
 tools/build_variables.bzl                        |   1 +
 torch/csrc/jit/runtime/static/impl.cpp           | 190 +---------------------
 torch/csrc/jit/runtime/static/impl.h             |  75 +--------
 torch/csrc/jit/runtime/static/memory_planner.cpp | 196 +++++++++++++++++++++++
 torch/csrc/jit/runtime/static/memory_planner.h   |  83 ++++++++++
 5 files changed, 284 insertions(+), 261 deletions(-)
 create mode 100644 torch/csrc/jit/runtime/static/memory_planner.cpp
 create mode 100644 torch/csrc/jit/runtime/static/memory_planner.h

diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index a139515..a8cb1dc 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -330,6 +330,7 @@ core_sources_full_mobile = core_sources_full_mobile_no_backend_interface + [
 core_sources_full = core_sources_full_mobile + [
     "torch/csrc/jit/runtime/static/fusion.cpp",
     "torch/csrc/jit/runtime/static/impl.cpp",
+    "torch/csrc/jit/runtime/static/memory_planner.cpp",
     "torch/csrc/jit/runtime/static/native_ops.cpp",
     "torch/csrc/jit/runtime/static/ops.cpp",
     "torch/csrc/jit/runtime/static/passes.cpp",
diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index 19d25d5..8277118 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include <torch/csrc/jit/runtime/static/memory_planner.h>
 #include
 #include
 #include
@@ -758,6 +759,8 @@ StaticRuntime::StaticRuntime(const StaticModule& sm) : static_module_(sm) {
   }
 }
 
+StaticRuntime::~StaticRuntime() = default;
+
 std::vector<c10::IValue> StaticRuntime::operator()(
     const std::vector<c10::IValue>& inps) {
   std::vector<c10::IValue> stack;
@@ -1251,193 +1254,6 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) {
   VLOG(1) << "Finished checking for memory leak";
 }
 
-static void assign_storage_to_managed_tensors(
-    StaticRuntime* runtime,
-    const FastSet<const Value*>& managed_tensor_values,
-    const FastMap<const Value*, std::vector<const Value*>>&
-        value_to_same_storage_values,
-    std::vector<std::pair<size_t, std::vector<at::Tensor*>>>& managed_tensors) {
-  // map Value to index to managed_storage, where multiple values can
-  // map to the same index (i.e., sharing the same storage)
-  FastMap<const Value*, size_t> value_to_storage_idx;
-
-  // Snapshot of the current memory state
-  for (auto& pnode : runtime->nodes()) {
-    for (const auto i : c10::irange(pnode.outputs().size())) {
-      auto& ival = pnode.Output(i);
-      const auto* val = pnode.node()->outputs()[i];
-      if (managed_tensor_values.count(val)) {
-        TORCH_CHECK(ival.isTensor());
-        at::Tensor* tensor = &ival.toTensor();
-        auto f = value_to_storage_idx.find(val);
-        if (f != value_to_storage_idx.end()) {
-          auto storage_idx = f->second;
-          managed_tensors[storage_idx].second.emplace_back(tensor);
-        } else {
-          auto p =
-              std::make_pair<size_t, std::vector<at::Tensor*>>(0, {tensor});
-          managed_tensors.emplace_back(std::move(p));
-          // first of a group, update the value_to_storage_idx map with the
-          // index
-          auto f = value_to_same_storage_values.find(val);
-          if (f != value_to_same_storage_values.end()) {
-            auto storage_idx = managed_tensors.size() - 1;
-            const auto& same_storage_values = f->second;
-            for (const auto* v : same_storage_values) {
-              value_to_storage_idx[v] = storage_idx;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-MemoryPlanner::MemoryPlanner(
-    StaticRuntime* runtime,
-    const FastMap<const Value*, std::vector<const Value*>>&
-        value_to_same_storage_values,
-    const FastSet<const Value*>& external_values,
-    bool enable_out_variant,
-    bool manage_graph_output_memory) {
-  // collect register indices of outputs of ops with out variant
-  FastSet<const Value*> managed_tensor_values;
-  FastSet<const Value*> leaked_values;
-  if (enable_out_variant) {
-    for (ProcessedNode& pnode : runtime->nodes()) {
-      if (pnode.has_out_variant()) {
-        for (const auto i : c10::irange(pnode.outputs().size())) {
-          const Value* out_v = pnode.node()->outputs()[i];
-          if (external_values.count(out_v)) {
-            continue;
-          }
-          // Types are stored in the underlying TorchScript IR
-          const auto& type = out_v->type();
-          if (type->castRaw<TensorType>()) {
-            managed_tensor_values.insert(out_v);
-          } else if (runtime->is_optimizable_container_type(pnode.node())) {
-            // We "leak" certain container types because their allocations
-            // take a long time
-            leaked_values.insert(out_v);
-          }
-        }
-      }
-    }
-  }
-
-  // collect unmanaged output ivalues
-  FastSet<IValue*> unmanaged_ivalues;
-  for (ProcessedNode& pnode : runtime->nodes()) {
-    for (const auto i : c10::irange(pnode.outputs().size())) {
-      // Types are stored in the underlying TorchScript IR
-      const Value* out_v = pnode.node()->outputs()[i];
-      if (managed_tensor_values.count(out_v) || leaked_values.count(out_v)) {
-        continue;
-      }
-      IValue& out = pnode.Output(i);
-      unmanaged_ivalues.insert(&out);
-    }
-  }
-  // since runtime->outputs() escape from run(), remove them from
-  // managed_tensor_values and from unmanaged_ivalues
-  for (const Value* output : runtime->graph().outputs()) {
-    managed_tensor_values.erase(output);
-  }
-  for (IValue* output : runtime->outputs()) {
-    unmanaged_ivalues.erase(output);
-  }
-
-  // copy to unmanaged_ivalues_
-  unmanaged_ivalues_.reserve(unmanaged_ivalues.size());
-  unmanaged_ivalues_.insert(
-      unmanaged_ivalues_.begin(),
-      unmanaged_ivalues.begin(),
-      unmanaged_ivalues.end());
-
-  if (enable_out_variant) {
-    ::torch::jit::assign_storage_to_managed_tensors(
-        runtime,
-        managed_tensor_values,
-        value_to_same_storage_values,
-        managed_tensors_);
-  }
-}
-
-// Don't change the size if it is already aligned; otherwise increase the size
-// to make it aligned.
-size_t MemoryPlanner::compute_aligned_tensor_size(size_t nbytes) {
-  // Note: everything below is size_t
-  return (nbytes + c10::gAlignment - 1) & (~(c10::gAlignment - 1));
-}
-
-at::DataPtr MemoryPlanner::allocate_buffer(size_t size) {
-  at::Allocator* allocator = c10::GetCPUCachingAllocator();
-  return allocator->allocate(size);
-}
-
-void MemoryPlanner::allocate() {
-  if (managed_bytes_ == 0) {
-    return;
-  }
-  buffer_ = allocate_buffer(managed_bytes_);
-
-  size_t offset = 0;
-  uint8_t* start = static_cast<uint8_t*>(buffer_.get());
-
-  reused_tensors_ = 0;
-  for (const auto& ms : managed_tensors_) {
-    auto tensor_size = ms.first;
-    if (tensor_size == 0) {
-      continue;
-    }
-    const auto& tensors = ms.second;
-    DCHECK_LE(offset + tensor_size, managed_bytes_);
-    void* src = static_cast<void*>(start + offset);
-
-    for (auto* tensor : tensors) {
-      tensor->storage().set_data_ptr_noswap(
-          at::DataPtr(src, src, nullptr, tensor->device()));
-      tensor->storage().set_nbytes(tensor_size);
-      reused_tensors_++;
-    }
-    reused_tensors_--;
-
-    offset += tensor_size;
-  }
-  DCHECK_EQ(offset, managed_bytes_);
-}
-
-void MemoryPlanner::deallocate() {
-  managed_bytes_ = 0;
-
-  // free memory used by outputs of ops in out variants
-  // but keep the TensorImpl and StorageImpl around
-  for (auto& ms : managed_tensors_) {
-    const auto& tensors = ms.second;
-    size_t max = ms.first;
-    for (auto& tensor : tensors) {
-      size_t current_size =
-          compute_aligned_tensor_size(tensor->storage().nbytes());
-      tensor->storage().unsafeGetStorageImpl()->reset();
-      max = std::max(max, current_size);
-    }
-    // Static runtime does not know the size of tensors statically, so we use
-    // the tensor size from the previous run to allocate tensors for the next
-    // run (following C2 tradition), exploiting the fact that the tensor
-    // storage size does not have to match the actual tensor size. The
-    // following logic records the tensor storage size for the next run.
-    ms.first = max;
-    managed_bytes_ += max;
-  }
-
-  // for unmanaged ivalues (either tensor or non-tensor), we reset the *iv so
-  // that the objects pointed to by *iv may be reclaimed by reference counting
-  for (auto& iv : unmanaged_ivalues_) {
-    *iv = IValue();
-  }
-  buffer_ = {};
-}
-
 ProcessedNode::ProcessedNode(
     Node* node,
     std::vector<const IValue*>&& inputs,
diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h
index 4b5560f..7bef039 100644
--- a/torch/csrc/jit/runtime/static/impl.h
+++ b/torch/csrc/jit/runtime/static/impl.h
@@ -216,6 +216,7 @@ class TORCH_API StaticModule {
 class TORCH_API StaticRuntime {
  public:
   explicit StaticRuntime(const StaticModule& sm);
+  ~StaticRuntime();
 
   std::vector<c10::IValue> operator()(const std::vector<c10::IValue>& inps);
 
@@ -323,80 +324,6 @@ class TORCH_API StaticRuntime {
   std::vector<ProcessedNode> nodes_;
 };
 
-/// There are three types of ops in a processed graph in Static Runtime:
-/// 1. op with _out variant
-/// 2. view producing op
-/// 3. tensor producing op (could be replaced with type 1 by adding the _out
-///    variant to Static Runtime)
-/// In Static Runtime, type 2 ops are replaced with their corresponding copy
-/// versions when enable_out_variant is enabled and become type 1 ops. The
-/// memory planner only manages tensors that are outputs of type 1 ops. For
-/// type 3, the output tensors are allocated inside the operator and can't be
-/// directly managed by the memory planner.
-///
-/// The memory planner tries to minimize the number of memory allocations by
-/// tracking the output tensors of ops with _out variants with unique DataPtr
-/// (part of StorageImpl). It tries to do this in several steps:
-/// 1. record the max memory usage for each Tensor with unique DataPtr at the
-///    end of each iteration
-/// 2. in the next iteration, allocate the buffer for the max total usage and
-///    compute the offset of each allocation with regard to the single memory
-///    buffer, optionally reusing memory. In the first iteration, we rely on
-///    the default allocator for memory allocation.
-/// 3. free the buffer at the end of each iteration
-/// Steps 1 and 3 are handled by `deallocate()`, and step 2 by `allocate()`.
-/// Only models with simple output types are supported, i.e. None, Tensor or
-/// List/Tuple/Dict of Tensors. Complex output types such as List of Lists are
-/// not supported.
-
-class MemoryPlanner {
- public:
-  explicit MemoryPlanner(
-      StaticRuntime* runtime,
-      const FastMap<const Value*, std::vector<const Value*>>&,
-      const FastSet<const Value*>& external_values,
-      bool enable_out_variant,
-      bool manage_graph_output_memory);
-  // disable copying and moving
-  MemoryPlanner(const MemoryPlanner&) = delete;
-  MemoryPlanner& operator=(const MemoryPlanner&) = delete;
-  MemoryPlanner(MemoryPlanner&&) = delete;
-  MemoryPlanner& operator=(MemoryPlanner&&) = delete;
-
-  void allocate();
-  void deallocate();
-
-  size_t total_managed() const {
-    return managed_bytes_;
-  }
-  size_t total_reused_tensors() const {
-    return reused_tensors_;
-  }
-
- private:
-  // ivalues created in one run but not managed by MemoryPlanner
-  std::vector<IValue*> unmanaged_ivalues_;
-
-  // each pair contains the size (in bytes) of data to be allocated
-  // and a vector of Tensors that should be backed by that same data.
-  // Thus, if memonger is disabled, all vectors are of size 1.
-  std::vector<std::pair<size_t, std::vector<at::Tensor*>>> managed_tensors_;
-  at::DataPtr buffer_; // allocated each time we call Run()
-  size_t managed_bytes_{0};
-  size_t reused_tensors_{0};
-
-  // since output tensors are alive after one inference, their storage
-  // is managed differently (e.g., deallocation happens on the client side)
-  // std::vector<std::pair<size_t, std::vector<at::Tensor*>>>
-  //     managed_output_storage_;
-  // size_t managed_output_bytes_{0};
-  // size_t reused_output_tensors_{0};
-  // at::DataPtr output_buffer_; // allocated each time we call Run()
-
-  static size_t compute_aligned_tensor_size(size_t nbytes);
-  static at::DataPtr allocate_buffer(size_t size);
-};
-
 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
 class TORCH_API ProcessedNode {
  public:
diff --git a/torch/csrc/jit/runtime/static/memory_planner.cpp b/torch/csrc/jit/runtime/static/memory_planner.cpp
new file mode 100644
index 0000000..4c51970
--- /dev/null
+++ b/torch/csrc/jit/runtime/static/memory_planner.cpp
@@ -0,0 +1,196 @@
+#include <torch/csrc/jit/runtime/static/memory_planner.h>
+
+#include
+
+namespace torch {
+namespace jit {
+
+static void assign_storage_to_managed_tensors(
+    StaticRuntime* runtime,
+    const FastSet<const Value*>& managed_tensor_values,
+    const FastMap<const Value*, std::vector<const Value*>>&
+        value_to_same_storage_values,
+    std::vector<std::pair<size_t, std::vector<at::Tensor*>>>& managed_tensors) {
+  // map Value to index to managed_storage, where multiple values can
+  // map to the same index (i.e., sharing the same storage)
+  FastMap<const Value*, size_t> value_to_storage_idx;
+
+  // Snapshot of the current memory state
+  for (auto& pnode : runtime->nodes()) {
+    for (const auto i : c10::irange(pnode.outputs().size())) {
+      auto& ival = pnode.Output(i);
+      const auto* val = pnode.node()->outputs()[i];
+      if (managed_tensor_values.count(val)) {
+        TORCH_CHECK(ival.isTensor());
+        at::Tensor* tensor = &ival.toTensor();
+        auto f = value_to_storage_idx.find(val);
+        if (f != value_to_storage_idx.end()) {
+          auto storage_idx = f->second;
+          managed_tensors[storage_idx].second.emplace_back(tensor);
+        } else {
+          auto p =
+              std::make_pair<size_t, std::vector<at::Tensor*>>(0, {tensor});
+          managed_tensors.emplace_back(std::move(p));
+          // first of a group, update the value_to_storage_idx map with the
+          // index
+          auto f = value_to_same_storage_values.find(val);
+          if (f != value_to_same_storage_values.end()) {
+            auto storage_idx = managed_tensors.size() - 1;
+            const auto& same_storage_values = f->second;
+            for (const auto* v : same_storage_values) {
+              value_to_storage_idx[v] = storage_idx;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+MemoryPlanner::MemoryPlanner(
+    StaticRuntime* runtime,
+    const FastMap<const Value*, std::vector<const Value*>>&
+        value_to_same_storage_values,
+    const FastSet<const Value*>& external_values,
+    bool enable_out_variant,
+    bool manage_graph_output_memory) {
+  // collect register indices of outputs of ops with out variant
+  FastSet<const Value*> managed_tensor_values;
+  FastSet<const Value*> leaked_values;
+  if (enable_out_variant) {
+    for (ProcessedNode& pnode : runtime->nodes()) {
+      if (pnode.has_out_variant()) {
+        for (const auto i : c10::irange(pnode.outputs().size())) {
+          const Value* out_v = pnode.node()->outputs()[i];
+          if (external_values.count(out_v)) {
+            continue;
+          }
+          // Types are stored in the underlying TorchScript IR
+          const auto& type = out_v->type();
+          if (type->castRaw<TensorType>()) {
+            managed_tensor_values.insert(out_v);
+          } else if (runtime->is_optimizable_container_type(pnode.node())) {
+            // We "leak" certain container types because their allocations
+            // take a long time
+            leaked_values.insert(out_v);
+          }
+        }
+      }
+    }
+  }
+
+  // collect unmanaged output ivalues
+  FastSet<IValue*> unmanaged_ivalues;
+  for (ProcessedNode& pnode : runtime->nodes()) {
+    for (const auto i : c10::irange(pnode.outputs().size())) {
+      // Types are stored in the underlying TorchScript IR
+      const Value* out_v = pnode.node()->outputs()[i];
+      if (managed_tensor_values.count(out_v) || leaked_values.count(out_v)) {
+        continue;
+      }
+      IValue& out = pnode.Output(i);
+      unmanaged_ivalues.insert(&out);
+    }
+  }
+  // since runtime->outputs() escape from run(), remove them from
+  // managed_tensor_values and from unmanaged_ivalues
+  for (const Value* output : runtime->graph().outputs()) {
+    managed_tensor_values.erase(output);
+  }
+  for (IValue* output : runtime->outputs()) {
+    unmanaged_ivalues.erase(output);
+  }
+
+  // copy to unmanaged_ivalues_
+  unmanaged_ivalues_.reserve(unmanaged_ivalues.size());
+  unmanaged_ivalues_.insert(
+      unmanaged_ivalues_.begin(),
+      unmanaged_ivalues.begin(),
+      unmanaged_ivalues.end());
+
+  if (enable_out_variant) {
+    ::torch::jit::assign_storage_to_managed_tensors(
+        runtime,
+        managed_tensor_values,
+        value_to_same_storage_values,
+        managed_tensors_);
+  }
+}
+
+// Don't change the size if it is already aligned; otherwise increase the size
+// to make it aligned.
+size_t MemoryPlanner::compute_aligned_tensor_size(size_t nbytes) {
+  // Note: everything below is size_t
+  return (nbytes + c10::gAlignment - 1) & (~(c10::gAlignment - 1));
+}
+
+at::DataPtr MemoryPlanner::allocate_buffer(size_t size) {
+  at::Allocator* allocator = c10::GetCPUCachingAllocator();
+  return allocator->allocate(size);
+}
+
+void MemoryPlanner::allocate() {
+  if (managed_bytes_ == 0) {
+    return;
+  }
+  buffer_ = allocate_buffer(managed_bytes_);
+
+  size_t offset = 0;
+  uint8_t* start = static_cast<uint8_t*>(buffer_.get());
+
+  reused_tensors_ = 0;
+  for (const auto& ms : managed_tensors_) {
+    auto tensor_size = ms.first;
+    if (tensor_size == 0) {
+      continue;
+    }
+    const auto& tensors = ms.second;
+    DCHECK_LE(offset + tensor_size, managed_bytes_);
+    void* src = static_cast<void*>(start + offset);
+
+    for (auto* tensor : tensors) {
+      tensor->storage().set_data_ptr_noswap(
+          at::DataPtr(src, src, nullptr, tensor->device()));
+      tensor->storage().set_nbytes(tensor_size);
+      reused_tensors_++;
+    }
+    reused_tensors_--;
+
+    offset += tensor_size;
+  }
+  DCHECK_EQ(offset, managed_bytes_);
+}
+
+void MemoryPlanner::deallocate() {
+  managed_bytes_ = 0;
+
+  // free memory used by outputs of ops in out variants
+  // but keep the TensorImpl and StorageImpl around
+  for (auto& ms : managed_tensors_) {
+    const auto& tensors = ms.second;
+    size_t max = ms.first;
+    for (auto& tensor : tensors) {
+      size_t current_size =
+          compute_aligned_tensor_size(tensor->storage().nbytes());
+      tensor->storage().unsafeGetStorageImpl()->reset();
+      max = std::max(max, current_size);
+    }
+    // Static runtime does not know the size of tensors statically, so we use
+    // the tensor size from the previous run to allocate tensors for the next
+    // run (following C2 tradition), exploiting the fact that the tensor
+    // storage size does not have to match the actual tensor size. The
+    // following logic records the tensor storage size for the next run.
+    ms.first = max;
+    managed_bytes_ += max;
+  }
+
+  // for unmanaged ivalues (either tensor or non-tensor), we reset the *iv so
+  // that the objects pointed to by *iv may be reclaimed by reference counting
+  for (auto& iv : unmanaged_ivalues_) {
+    *iv = IValue();
+  }
+  buffer_ = {};
+}
+
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/runtime/static/memory_planner.h b/torch/csrc/jit/runtime/static/memory_planner.h
new file mode 100644
index 0000000..3e6e07c
--- /dev/null
+++ b/torch/csrc/jit/runtime/static/memory_planner.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include <torch/csrc/jit/runtime/static/impl.h>
+
+namespace torch {
+namespace jit {
+
+/// There are three types of ops in a processed graph in Static Runtime:
+/// 1. op with _out variant
+/// 2. view producing op
+/// 3. tensor producing op (could be replaced with type 1 by adding the _out
+///    variant to Static Runtime)
+/// In Static Runtime, type 2 ops are replaced with their corresponding copy
+/// versions when enable_out_variant is enabled and become type 1 ops. The
+/// memory planner only manages tensors that are outputs of type 1 ops. For
+/// type 3, the output tensors are allocated inside the operator and can't be
+/// directly managed by the memory planner.
+///
+/// The memory planner tries to minimize the number of memory allocations by
+/// tracking the output tensors of ops with _out variants with unique DataPtr
+/// (part of StorageImpl). It tries to do this in several steps:
+/// 1. record the max memory usage for each Tensor with unique DataPtr at the
+///    end of each iteration
+/// 2. in the next iteration, allocate the buffer for the max total usage and
+///    compute the offset of each allocation with regard to the single memory
+///    buffer, optionally reusing memory. In the first iteration, we rely on
+///    the default allocator for memory allocation.
+/// 3. free the buffer at the end of each iteration
+/// Steps 1 and 3 are handled by `deallocate()`, and step 2 by `allocate()`.
+/// Only models with simple output types are supported, i.e. None, Tensor or
+/// List/Tuple/Dict of Tensors. Complex output types such as List of Lists are
+/// not supported.
+
+class MemoryPlanner {
+ public:
+  explicit MemoryPlanner(
+      StaticRuntime* runtime,
+      const FastMap<const Value*, std::vector<const Value*>>&,
+      const FastSet<const Value*>& external_values,
+      bool enable_out_variant,
+      bool manage_graph_output_memory);
+  // disable copying and moving
+  MemoryPlanner(const MemoryPlanner&) = delete;
+  MemoryPlanner& operator=(const MemoryPlanner&) = delete;
+  MemoryPlanner(MemoryPlanner&&) = delete;
+  MemoryPlanner& operator=(MemoryPlanner&&) = delete;
+
+  void allocate();
+  void deallocate();
+
+  size_t total_managed() const {
+    return managed_bytes_;
+  }
+  size_t total_reused_tensors() const {
+    return reused_tensors_;
+  }
+
+ private:
+  // ivalues created in one run but not managed by MemoryPlanner
+  std::vector<IValue*> unmanaged_ivalues_;
+
+  // each pair contains the size (in bytes) of data to be allocated
+  // and a vector of Tensors that should be backed by that same data.
+  // Thus, if memonger is disabled, all vectors are of size 1.
+  std::vector<std::pair<size_t, std::vector<at::Tensor*>>> managed_tensors_;
+  at::DataPtr buffer_; // allocated each time we call Run()
+  size_t managed_bytes_{0};
+  size_t reused_tensors_{0};
+
+  // since output tensors are alive after one inference, their storage
+  // is managed differently (e.g., deallocation happens on the client side)
+  // std::vector<std::pair<size_t, std::vector<at::Tensor*>>>
+  //     managed_output_storage_;
+  // size_t managed_output_bytes_{0};
+  // size_t reused_output_tensors_{0};
+  // at::DataPtr output_buffer_; // allocated each time we call Run()
+
+  static size_t compute_aligned_tensor_size(size_t nbytes);
+  static at::DataPtr allocate_buffer(size_t size);
+};
+
+} // namespace jit
+} // namespace torch
-- 
2.7.4
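
As a rough, self-contained illustration of the scheme described in the MemoryPlanner header comment (record per-slot high-water marks at the end of a run, then carve one buffer into per-slot offsets on the next run), here is a minimal C++ sketch that uses no PyTorch types. Names such as ToyMemoryPlanner, Slot, and kAlignment are hypothetical stand-ins and are not part of the patch.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// One managed "storage slot": the planner remembers the largest aligned size
// seen for the slot and hands out an offset into a shared arena on later runs.
struct Slot {
  size_t max_nbytes = 0; // high-water mark recorded by deallocate()
  void* data = nullptr;  // points into the arena while a run is active
};

class ToyMemoryPlanner {
 public:
  explicit ToyMemoryPlanner(size_t num_slots) : slots_(num_slots) {}

  // Round nbytes up to a multiple of kAlignment; the same bit trick as
  // compute_aligned_tensor_size in the patch, with a fixed alignment here.
  static size_t aligned_size(size_t nbytes) {
    return (nbytes + kAlignment - 1) & ~(kAlignment - 1);
  }

  // Step 2: allocate one buffer sized for the previous run's total and assign
  // each slot an offset into it. On the very first run total_bytes_ is still
  // zero, so callers fall back to their regular allocator.
  void allocate() {
    if (total_bytes_ == 0) {
      return;
    }
    arena_.resize(total_bytes_);
    size_t offset = 0;
    for (Slot& slot : slots_) {
      slot.data = arena_.data() + offset;
      offset += slot.max_nbytes;
    }
  }

  // Steps 1 and 3: record how many bytes each slot actually needed this run
  // and drop the arena; the recorded sizes drive the next allocate().
  void deallocate(const std::vector<size_t>& used_nbytes) {
    total_bytes_ = 0;
    for (size_t i = 0; i < slots_.size(); ++i) {
      slots_[i].max_nbytes =
          std::max(slots_[i].max_nbytes, aligned_size(used_nbytes[i]));
      slots_[i].data = nullptr;
      total_bytes_ += slots_[i].max_nbytes;
    }
    arena_ = {}; // free the buffer, analogous to buffer_ = {} in the patch
  }

  // nullptr on the first run, an arena pointer afterwards.
  void* slot_data(size_t i) const {
    return slots_[i].data;
  }

 private:
  static constexpr size_t kAlignment = 64;
  std::vector<Slot> slots_;
  std::vector<uint8_t> arena_; // the single reusable buffer
  size_t total_bytes_ = 0;
};

A runtime loop would call allocate(), execute the graph while writing each managed output into slot_data(i), then call deallocate() with the byte counts actually produced; the first iteration has no planned buffer and relies on the default allocator, matching the header comment above.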
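The grouping done by assign_storage_to_managed_tensors, where several IR Values known to be able to share a storage are folded into one slot, can also be sketched in isolation. This is a hypothetical stand-alone version: ValueId, assign_slots, and share_groups are invented names, and share_groups corresponds to value_to_same_storage_values, which the patch receives from analysis performed elsewhere in impl.cpp.

#include <cstddef>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for const torch::jit::Value*.
using ValueId = int;

// Assign each managed value a storage-slot index so that values in the same
// share-group end up backed by the same region of the planner's buffer.
// Mirrors the first-of-a-group logic in the patch: the first member seen
// creates the slot and pre-registers its peers.
std::unordered_map<ValueId, size_t> assign_slots(
    const std::vector<ValueId>& managed_values, // in execution order
    const std::unordered_map<ValueId, std::vector<ValueId>>& share_groups,
    size_t* num_slots_out) {
  std::unordered_map<ValueId, size_t> slot_of;
  size_t num_slots = 0;
  for (ValueId v : managed_values) {
    if (slot_of.count(v)) {
      continue; // already placed by an earlier member of its group
    }
    const size_t slot = num_slots++;
    slot_of[v] = slot;
    auto group = share_groups.find(v);
    if (group != share_groups.end()) {
      for (ValueId peer : group->second) {
        slot_of[peer] = slot; // later members reuse this slot
      }
    }
  }
  *num_slots_out = num_slots;
  return slot_of;
}

In the real code the slot index is then used to append the output Tensor pointer to managed_tensors_[slot].second, so that allocate() later points every member of the group at the same offset in the shared buffer.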