From 3c3bba4169067a7340ff1d786a6b61282cf26820 Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Fri, 27 Aug 2021 01:39:14 -0700 Subject: [PATCH] [Static Runtime] Use F14FastMap/F14FastSet (#63999) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63999 Use folly::F14FastMap/F14FastSet instead of std::unordered_map/unordered_set in the Static Runtime code base. folly::F14FastMap/F14FastSet implements the same APIs as std::unordered_map/unordered_set but faster. For details see https://github.com/facebook/folly/blob/master/folly/container/F14.md Reviewed By: d1jang Differential Revision: D30566149 fbshipit-source-id: 20a7fa2519e4dde96fb3fc61ef6c92bf6d759383 --- torch/csrc/jit/runtime/static/impl.cpp | 83 ++++++++++++++-------------- torch/csrc/jit/runtime/static/impl.h | 33 ++++++++--- torch/csrc/jit/runtime/static/ops.cpp | 3 +- torch/csrc/jit/runtime/static/te_wrapper.cpp | 5 +- 4 files changed, 72 insertions(+), 52 deletions(-) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index cb9342b..b3e1eb1 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -104,8 +104,8 @@ bool mayContainAlias(AliasDb& db, const Value* a, const Value* b) { bool mayContainAlias( AliasDb& db, - const std::unordered_set& a, - const std::unordered_set& b) { + const FastSet& a, + const FastSet& b) { std::vector as; std::vector bs; as.reserve(a.size()); @@ -122,11 +122,11 @@ bool mayContainAlias( } // Get set of all inputs/outputs/constants (always alive) and their aliases -std::unordered_set GetAlwaysAliveValues( +FastSet GetAlwaysAliveValues( const std::shared_ptr& graph, AliasDb& db) { // a set of Values whose live-range exceed current inference - std::unordered_set always_alive; + FastSet always_alive; // mark inputs, constants, outputs as always_alive for (const auto* input : graph->inputs()) { @@ -148,7 +148,7 @@ std::unordered_set GetAlwaysAliveValues( // constants are already in the always_alive set if (node->kind() != prim::Constant) { for (const auto* v : node->outputs()) { - if (mayContainAlias(db, ValueSet{v}, always_alive)) { + if (mayContainAlias(db, {v}, always_alive)) { always_alive.insert(v); } } @@ -158,22 +158,22 @@ std::unordered_set GetAlwaysAliveValues( } // Map each value to all values that are alive at the same time. -using LivenessMap = std::unordered_map>; +using LivenessMap = FastMap>; // The algorithm does a traversal of the execution graph // while keeping track of the live values. LivenessMap GetLivenessMap( const std::shared_ptr& graph, - const std::unordered_set& always_alive, + const FastSet& always_alive, AliasDb& db) { // map a Value to a set of Values that overlap live-ranges with the Value's - std::unordered_map> liveness_map; + FastMap> liveness_map; // map Values to its creation order in graph (Note: only traverse top-level // nodes such that nodes under control-flows are represented by top-level // block nodes) std::vector values_in_creation_order; - std::unordered_map values_to_idx_in_creation_order; + FastMap values_to_idx_in_creation_order; for (const auto* node : graph->nodes()) { for (const auto* v : node->outputs()) { values_to_idx_in_creation_order[v] = values_in_creation_order.size(); @@ -184,10 +184,10 @@ LivenessMap GetLivenessMap( // presence of a Value in live_values_use_chain means the Value alive // Value mapped to set of Nodes that may use the Value (i.e., use-chain of // Value) - std::unordered_map> live_values_use_chain; + FastMap> live_values_use_chain; // Node mapped to set of Values that the Node may use (i.e., def-chain of node // inputs) - std::unordered_map> live_nodes_def_chain; + FastMap> live_nodes_def_chain; // add v to the current liveness_map std::function add_live_value_fn = [&](const Value* v) { @@ -320,12 +320,12 @@ LivenessMap GetLivenessMap( std::pair, std::vector> GetMemoryPlanningCandidates(const std::shared_ptr& graph) { // for determinism - std::unordered_set seen_values; + FastSet seen_values; std::vector all_values; - std::unordered_set can_reuse; + FastSet can_reuse; // values used by unsupported ops (as either inputs or outputs) // these need to be removed from "can_reuse" after analyzing all nodes - std::unordered_set cannot_reuse; + FastSet cannot_reuse; for (auto* n : graph->nodes()) { bool can_reuse_inputs_outputs = canReuseInputsOutputs(n); for (const auto* v : n->inputs()) { @@ -388,10 +388,9 @@ GetMemoryPlanningCandidates(const std::shared_ptr& graph) { // // NB: This is a deterministic implementation, which makes it easier to tune // and debug. -std::unordered_map> -GenerateSameStorageValues( +FastMap> GenerateSameStorageValues( const LivenessMap& alive_during, - const std::unordered_set& always_alive, + const FastSet& always_alive, const std::pair, std::vector>& optimizable, AliasDb& db) { @@ -399,8 +398,7 @@ GenerateSameStorageValues( const auto& all_values = optimizable.second; // map Value* to a set Value* that can share the same storage with it - std::unordered_map> - same_storage_values; + FastMap> same_storage_values; // make new_v and old_v map to the same storage (i.e., add to each other's // same_storage_values set) @@ -589,9 +587,9 @@ StaticModule::StaticModule( } // map Value* to IValue (from inputs or prim::Constant) or null - std::unordered_map value_to_ivalue; + FastMap value_to_ivalue; // map Value* to its SSA definition IR - std::unordered_map value_to_ssa_def; + FastMap value_to_ssa_def; // N inputs map to the first N entries in storage for (const auto i : c10::irange(graph_->inputs().size())) { @@ -1165,8 +1163,7 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) { TORCH_CHECK(inputs_[i].isNone(), "Input ", i, " was not cleaned up"); } - std::unordered_set output_ivalues( - outputs_.begin(), outputs_.end()); + FastSet output_ivalues(outputs_.begin(), outputs_.end()); for (const auto n : c10::irange(nodes_.size())) { auto& pnode = nodes_[n]; for (const auto i : c10::irange(pnode.outputs().size())) { @@ -1202,13 +1199,13 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) { static void assign_storage_to_managed_tensors( StaticRuntime* runtime, - const std::unordered_set& managed_tensor_values, - const std::unordered_map>& + const FastSet& managed_tensor_values, + const FastMap>& value_to_same_storage_values, std::vector>>& managed_tensors) { // map Value to index to managed_storage, where multiple values can // map to the same index (i.e., sharing the same storage) - std::unordered_map value_to_storage_idx; + FastMap value_to_storage_idx; // Snapshot of the current memory state for (auto& pnode : runtime->nodes()) { @@ -1218,19 +1215,21 @@ static void assign_storage_to_managed_tensors( if (managed_tensor_values.count(val)) { TORCH_CHECK(ival.isTensor()); at::Tensor* tensor = &ival.toTensor(); - - if (value_to_storage_idx.count(val)) { - managed_tensors[value_to_storage_idx[val]].second.emplace_back( - tensor); + auto f = value_to_storage_idx.find(val); + if (f != value_to_storage_idx.end()) { + auto storage_idx = f->second; + managed_tensors[storage_idx].second.emplace_back(tensor); } else { auto p = std::make_pair>(0, {tensor}); managed_tensors.emplace_back(std::move(p)); // first of a group, update the value_to_storage_idx map with the // index - if (value_to_same_storage_values.count(val)) { + auto f = value_to_same_storage_values.find(val); + if (f != value_to_same_storage_values.end()) { auto storage_idx = managed_tensors.size() - 1; - for (const auto* v : value_to_same_storage_values.at(val)) { + const auto& same_storage_values = f->second; + for (const auto* v : same_storage_values) { value_to_storage_idx[v] = storage_idx; } } @@ -1242,14 +1241,14 @@ static void assign_storage_to_managed_tensors( MemoryPlanner::MemoryPlanner( StaticRuntime* runtime, - const std::unordered_map>& + const FastMap>& value_to_same_storage_values, - const std::unordered_set& external_values, + const FastSet& external_values, bool enable_out_variant, bool manage_graph_output_memory) { // collect register indices of outputs of ops with out variant - std::unordered_set managed_tensor_values; - std::unordered_set leaked_values; + FastSet managed_tensor_values; + FastSet leaked_values; if (enable_out_variant) { for (ProcessedNode& pnode : runtime->nodes()) { if (pnode.has_out_variant()) { @@ -1260,7 +1259,7 @@ MemoryPlanner::MemoryPlanner( } // Types are stored in the underlying TorchScript IR const auto& type = out_v->type(); - if (type->cast()) { + if (type->castRaw()) { managed_tensor_values.insert(out_v); } else if (isOptimizableContainerType(pnode.node())) { // We "leak" certain container types because their allocations take @@ -1273,7 +1272,7 @@ MemoryPlanner::MemoryPlanner( } // collect unmanaged output ivalues - std::unordered_set unmanaged_ivalues; + FastSet unmanaged_ivalues; for (ProcessedNode& pnode : runtime->nodes()) { for (const auto i : c10::irange(pnode.outputs().size())) { // Types are stored in the underlying TorchScript IR @@ -1295,9 +1294,11 @@ MemoryPlanner::MemoryPlanner( } // copy to unmanaged_ivalues_ - for (IValue* out : unmanaged_ivalues) { - unmanaged_ivalues_.emplace_back(out); - } + unmanaged_ivalues_.reserve(unmanaged_ivalues.size()); + unmanaged_ivalues_.insert( + unmanaged_ivalues_.begin(), + unmanaged_ivalues.begin(), + unmanaged_ivalues.end()); if (enable_out_variant) { ::torch::jit::assign_storage_to_managed_tensors( diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h index b16cfef..6cff047 100644 --- a/torch/csrc/jit/runtime/static/impl.h +++ b/torch/csrc/jit/runtime/static/impl.h @@ -9,9 +9,26 @@ #include #include +#ifdef FBCODE_CAFFE2 +#include +#include +#endif + namespace torch { namespace jit { +#ifdef FBCODE_CAFFE2 +template +using FastMap = folly::F14FastMap; +template +using FastSet = folly::F14FastSet; +#else +template +using FastMap = std::unordered_map; +template +using FastSet = std::unordered_set; +#endif + TORCH_API bool canEnableStaticRuntime( const std::shared_ptr& graph); @@ -127,7 +144,7 @@ class TORCH_API StaticModule { size_t num_inputs() const; size_t num_outputs() const; - const std::unordered_map>& index_map() const { + const FastMap>& index_map() const { return node_inputs_ssa_def_map_; } @@ -147,12 +164,12 @@ class TORCH_API StaticModule { return schema_; } - const std::unordered_map>& + const FastMap>& values_share_same_storage() const { return value_to_same_storage_values_; } - const std::unordered_set& external_values() const { + const FastSet& external_values() const { return external_values_; } @@ -178,14 +195,14 @@ class TORCH_API StaticModule { // a vector of ssa_defs corresponding to graph->outputs() std::vector output_ssa_defs_; // map a node idx (in graph order) to a vector of ssa_defs for node inputs - std::unordered_map> node_inputs_ssa_def_map_; + FastMap> node_inputs_ssa_def_map_; // Bookkeeping for MemoryPlanner in StaticRuntime // values whose live-time exceeds that of running one inference (e.g., input, // output, prim::Constants, and their aliases) - std::unordered_set external_values_; + FastSet external_values_; // map a value to the set of values that may share the same storage with it - std::unordered_map> + FastMap> value_to_same_storage_values_; }; @@ -323,8 +340,8 @@ class MemoryPlanner { public: explicit MemoryPlanner( StaticRuntime* runtime, - const std::unordered_map>&, - const std::unordered_set& external_values, + const FastMap>&, + const FastSet& external_values, bool enable_out_variant, bool manage_graph_output_memory); // disable copying and moving diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 484c4b0..54c0456 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -288,7 +289,7 @@ bool disableUnsafeMathOp(const char* op_name) { // not guarantee bit exactness vs the jit interpreter. Note aten::relu is not // included even though it uses NNC because the results of relu should always // match. - static const std::unordered_set fast_ops{ + static const FastSet fast_ops{ "aten::add", "aten::tanh", "aten::sigmoid", "aten::logit"}; return fast_ops.count(op_name) > 0; } diff --git a/torch/csrc/jit/runtime/static/te_wrapper.cpp b/torch/csrc/jit/runtime/static/te_wrapper.cpp index d8b494c..acd1fb7 100644 --- a/torch/csrc/jit/runtime/static/te_wrapper.cpp +++ b/torch/csrc/jit/runtime/static/te_wrapper.cpp @@ -2,6 +2,7 @@ #include #include +#include namespace torch { namespace jit { @@ -79,8 +80,8 @@ std::mutex& getNNCCacheMutex() { return nncCacheMutex; } -std::unordered_map>& getNNCCache() { - static std::unordered_map> nncCache; +FastMap>& getNNCCache() { + static FastMap> nncCache; return nncCache; } -- 2.7.4