From 3c3bba4169067a7340ff1d786a6b61282cf26820 Mon Sep 17 00:00:00 2001
From: Hao Lu <hlu@fb.com>
Date: Fri, 27 Aug 2021 01:39:14 -0700
Subject: [PATCH] [Static Runtime] Use F14FastMap/F14FastSet (#63999)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63999

Use folly::F14FastMap/F14FastSet instead of std::unordered_map/unordered_set in the Static Runtime code base. folly::F14FastMap/F14FastSet implements the same APIs as std::unordered_map/unordered_set but faster. For details see https://github.com/facebook/folly/blob/master/folly/container/F14.md

Reviewed By: d1jang

Differential Revision: D30566149

fbshipit-source-id: 20a7fa2519e4dde96fb3fc61ef6c92bf6d759383
---
 torch/csrc/jit/runtime/static/impl.cpp       | 83 ++++++++++++++--------------
 torch/csrc/jit/runtime/static/impl.h         | 33 ++++++++---
 torch/csrc/jit/runtime/static/ops.cpp        |  3 +-
 torch/csrc/jit/runtime/static/te_wrapper.cpp |  5 +-
 4 files changed, 72 insertions(+), 52 deletions(-)
diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index cb9342b..b3e1eb1 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -104,8 +104,8 @@ bool mayContainAlias(AliasDb& db, const Value* a, const Value* b) {
 
 bool mayContainAlias(
     AliasDb& db,
-    const std::unordered_set<const Value*>& a,
-    const std::unordered_set<const Value*>& b) {
+    const FastSet<const Value*>& a,
+    const FastSet<const Value*>& b) {
   std::vector<Value*> as;
   std::vector<Value*> bs;
   as.reserve(a.size());
@@ -122,11 +122,11 @@ bool mayContainAlias(
 }
 
 // Get set of all inputs/outputs/constants (always alive) and their aliases
-std::unordered_set<const Value*> GetAlwaysAliveValues(
+FastSet<const Value*> GetAlwaysAliveValues(
     const std::shared_ptr<torch::jit::Graph>& graph,
     AliasDb& db) {
   // a set of Values whose live-range exceed current inference
-  std::unordered_set<const Value*> always_alive;
+  FastSet<const Value*> always_alive;
 
   // mark inputs, constants, outputs as always_alive
   for (const auto* input : graph->inputs()) {
@@ -148,7 +148,7 @@ std::unordered_set<const Value*> GetAlwaysAliveValues(
     // constants are already in the always_alive set
     if (node->kind() != prim::Constant) {
       for (const auto* v : node->outputs()) {
-        if (mayContainAlias(db, ValueSet{v}, always_alive)) {
+        if (mayContainAlias(db, {v}, always_alive)) {
           always_alive.insert(v);
         }
       }
@@ -158,22 +158,22 @@ std::unordered_set<const Value*> GetAlwaysAliveValues(
 }
 
 //  Map each value to all values that are alive at the same time.
-using LivenessMap = std::unordered_map<const Value*, std::set<const Value*>>;
+using LivenessMap = FastMap<const Value*, std::set<const Value*>>;
 
 //  The algorithm does a traversal of the execution graph
 //  while keeping track of the live values.
 LivenessMap GetLivenessMap(
     const std::shared_ptr<torch::jit::Graph>& graph,
-    const std::unordered_set<const Value*>& always_alive,
+    const FastSet<const Value*>& always_alive,
     AliasDb& db) {
   // map a Value to a set of Values that overlap live-ranges with the Value's
-  std::unordered_map<const Value*, std::set<const Value*>> liveness_map;
+  FastMap<const Value*, std::set<const Value*>> liveness_map;
 
   // map Values to its creation order in graph (Note: only traverse top-level
   // nodes such that nodes under control-flows are represented by top-level
   // block nodes)
   std::vector<const Value*> values_in_creation_order;
-  std::unordered_map<const Value*, size_t> values_to_idx_in_creation_order;
+  FastMap<const Value*, size_t> values_to_idx_in_creation_order;
   for (const auto* node : graph->nodes()) {
     for (const auto* v : node->outputs()) {
       values_to_idx_in_creation_order[v] = values_in_creation_order.size();
@@ -184,10 +184,10 @@ LivenessMap GetLivenessMap(
   // presence of a Value in live_values_use_chain means the Value alive
   // Value mapped to set of Nodes that may use the Value (i.e., use-chain of
   // Value)
-  std::unordered_map<const Value*, std::set<const Node*>> live_values_use_chain;
+  FastMap<const Value*, std::set<const Node*>> live_values_use_chain;
   // Node mapped to set of Values that the Node may use (i.e., def-chain of node
   // inputs)
-  std::unordered_map<const Node*, std::set<const Value*>> live_nodes_def_chain;
+  FastMap<const Node*, std::set<const Value*>> live_nodes_def_chain;
 
   // add v to the current liveness_map
   std::function<void(const Value* v)> add_live_value_fn = [&](const Value* v) {
@@ -320,12 +320,12 @@ LivenessMap GetLivenessMap(
 std::pair<std::vector<const Value*>, std::vector<const Value*>>
 GetMemoryPlanningCandidates(const std::shared_ptr<torch::jit::Graph>& graph) {
   // for determinism
-  std::unordered_set<const Value*> seen_values;
+  FastSet<const Value*> seen_values;
   std::vector<const Value*> all_values;
-  std::unordered_set<const Value*> can_reuse;
+  FastSet<const Value*> can_reuse;
   // values used by unsupported ops (as either inputs or outputs)
   // these need to be removed from "can_reuse" after analyzing all nodes
-  std::unordered_set<const Value*> cannot_reuse;
+  FastSet<const Value*> cannot_reuse;
   for (auto* n : graph->nodes()) {
     bool can_reuse_inputs_outputs = canReuseInputsOutputs(n);
     for (const auto* v : n->inputs()) {
@@ -388,10 +388,9 @@ GetMemoryPlanningCandidates(const std::shared_ptr<torch::jit::Graph>& graph) {
 //
 // NB: This is a deterministic implementation, which makes it easier to tune
 // and debug.
-std::unordered_map<const Value*, std::vector<const Value*>>
-GenerateSameStorageValues(
+FastMap<const Value*, std::vector<const Value*>> GenerateSameStorageValues(
     const LivenessMap& alive_during,
-    const std::unordered_set<const Value*>& always_alive,
+    const FastSet<const Value*>& always_alive,
     const std::pair<std::vector<const Value*>, std::vector<const Value*>>&
         optimizable,
     AliasDb& db) {
@@ -399,8 +398,7 @@ GenerateSameStorageValues(
   const auto& all_values = optimizable.second;
 
   // map Value* to a set Value* that can share the same storage with it
-  std::unordered_map<const Value*, std::vector<const Value*>>
-      same_storage_values;
+  FastMap<const Value*, std::vector<const Value*>> same_storage_values;
 
   // make new_v and old_v map to the same storage (i.e., add to each other's
   // same_storage_values set)
@@ -589,9 +587,9 @@ StaticModule::StaticModule(
   }
 
   // map Value* to IValue (from inputs or prim::Constant) or null
-  std::unordered_map<Value*, IValue*> value_to_ivalue;
+  FastMap<Value*, IValue*> value_to_ivalue;
   // map Value* to its SSA definition IR
-  std::unordered_map<Value*, DefInfo> value_to_ssa_def;
+  FastMap<Value*, DefInfo> value_to_ssa_def;
 
   // N inputs map to the first N entries in storage
   for (const auto i : c10::irange(graph_->inputs().size())) {
@@ -1165,8 +1163,7 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) {
     TORCH_CHECK(inputs_[i].isNone(), "Input ", i, " was not cleaned up");
   }
 
-  std::unordered_set<const IValue*> output_ivalues(
-      outputs_.begin(), outputs_.end());
+  FastSet<const IValue*> output_ivalues(outputs_.begin(), outputs_.end());
   for (const auto n : c10::irange(nodes_.size())) {
     auto& pnode = nodes_[n];
     for (const auto i : c10::irange(pnode.outputs().size())) {
@@ -1202,13 +1199,13 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) {
 
 static void assign_storage_to_managed_tensors(
     StaticRuntime* runtime,
-    const std::unordered_set<const Value*>& managed_tensor_values,
-    const std::unordered_map<const Value*, std::vector<const Value*>>&
+    const FastSet<const Value*>& managed_tensor_values,
+    const FastMap<const Value*, std::vector<const Value*>>&
         value_to_same_storage_values,
     std::vector<std::pair<size_t, std::vector<at::Tensor*>>>& managed_tensors) {
   // map Value to index to managed_storage, where multiple values can
   // map to the same index (i.e., sharing the same storage)
-  std::unordered_map<const Value*, size_t> value_to_storage_idx;
+  FastMap<const Value*, size_t> value_to_storage_idx;
 
   // Snapshot of the current memory state
   for (auto& pnode : runtime->nodes()) {
@@ -1218,19 +1215,21 @@ static void assign_storage_to_managed_tensors(
       if (managed_tensor_values.count(val)) {
         TORCH_CHECK(ival.isTensor());
         at::Tensor* tensor = &ival.toTensor();
-
-        if (value_to_storage_idx.count(val)) {
-          managed_tensors[value_to_storage_idx[val]].second.emplace_back(
-              tensor);
+        auto f = value_to_storage_idx.find(val);
+        if (f != value_to_storage_idx.end()) {
+          auto storage_idx = f->second;
+          managed_tensors[storage_idx].second.emplace_back(tensor);
         } else {
           auto p =
               std::make_pair<size_t, std::vector<at::Tensor*>>(0, {tensor});
           managed_tensors.emplace_back(std::move(p));
           // first of a group, update the value_to_storage_idx map with the
           // index
-          if (value_to_same_storage_values.count(val)) {
+          auto f = value_to_same_storage_values.find(val);
+          if (f != value_to_same_storage_values.end()) {
             auto storage_idx = managed_tensors.size() - 1;
-            for (const auto* v : value_to_same_storage_values.at(val)) {
+            const auto& same_storage_values = f->second;
+            for (const auto* v : same_storage_values) {
               value_to_storage_idx[v] = storage_idx;
             }
           }
@@ -1242,14 +1241,14 @@ static void assign_storage_to_managed_tensors(
 
 MemoryPlanner::MemoryPlanner(
     StaticRuntime* runtime,
-    const std::unordered_map<const Value*, std::vector<const Value*>>&
+    const FastMap<const Value*, std::vector<const Value*>>&
         value_to_same_storage_values,
-    const std::unordered_set<const Value*>& external_values,
+    const FastSet<const Value*>& external_values,
     bool enable_out_variant,
     bool manage_graph_output_memory) {
   // collect register indices of outputs of ops with out variant
-  std::unordered_set<const Value*> managed_tensor_values;
-  std::unordered_set<const Value*> leaked_values;
+  FastSet<const Value*> managed_tensor_values;
+  FastSet<const Value*> leaked_values;
   if (enable_out_variant) {
     for (ProcessedNode& pnode : runtime->nodes()) {
       if (pnode.has_out_variant()) {
@@ -1260,7 +1259,7 @@ MemoryPlanner::MemoryPlanner(
           }
           // Types are stored in the underlying TorchScript IR
           const auto& type = out_v->type();
-          if (type->cast<TensorType>()) {
+          if (type->castRaw<TensorType>()) {
             managed_tensor_values.insert(out_v);
           } else if (isOptimizableContainerType(pnode.node())) {
             // We "leak" certain container types because their allocations take
@@ -1273,7 +1272,7 @@ MemoryPlanner::MemoryPlanner(
   }
 
   // collect unmanaged output ivalues
-  std::unordered_set<IValue*> unmanaged_ivalues;
+  FastSet<IValue*> unmanaged_ivalues;
   for (ProcessedNode& pnode : runtime->nodes()) {
     for (const auto i : c10::irange(pnode.outputs().size())) {
       // Types are stored in the underlying TorchScript IR
@@ -1295,9 +1294,11 @@ MemoryPlanner::MemoryPlanner(
   }
 
   // copy to unmanaged_ivalues_
-  for (IValue* out : unmanaged_ivalues) {
-    unmanaged_ivalues_.emplace_back(out);
-  }
+  unmanaged_ivalues_.reserve(unmanaged_ivalues.size());
+  unmanaged_ivalues_.insert(
+      unmanaged_ivalues_.begin(),
+      unmanaged_ivalues.begin(),
+      unmanaged_ivalues.end());
 
   if (enable_out_variant) {
     ::torch::jit::assign_storage_to_managed_tensors(
diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h
index b16cfef..6cff047 100644
--- a/torch/csrc/jit/runtime/static/impl.h
+++ b/torch/csrc/jit/runtime/static/impl.h
@@ -9,9 +9,26 @@
 #include <torch/csrc/jit/passes/freeze_module.h>
 #include <torch/csrc/jit/passes/inliner.h>
 
+#ifdef FBCODE_CAFFE2
+#include <folly/container/F14Map.h>
+#include <folly/container/F14Set.h>
+#endif
+
 namespace torch {
 namespace jit {
 
+#ifdef FBCODE_CAFFE2
+template <typename Key, typename Value>
+using FastMap = folly::F14FastMap<Key, Value>;
+template <typename Key>
+using FastSet = folly::F14FastSet<Key>;
+#else
+template <typename Key, typename Value>
+using FastMap = std::unordered_map<Key, Value>;
+template <typename Key>
+using FastSet = std::unordered_set<Key>;
+#endif
+
 TORCH_API bool canEnableStaticRuntime(
     const std::shared_ptr<torch::jit::Graph>& graph);
 
@@ -127,7 +144,7 @@ class TORCH_API StaticModule {
   size_t num_inputs() const;
   size_t num_outputs() const;
 
-  const std::unordered_map<int, std::vector<DefInfo>>& index_map() const {
+  const FastMap<int, std::vector<DefInfo>>& index_map() const {
     return node_inputs_ssa_def_map_;
   }
 
@@ -147,12 +164,12 @@ class TORCH_API StaticModule {
     return schema_;
   }
 
-  const std::unordered_map<const Value*, std::vector<const Value*>>&
+  const FastMap<const Value*, std::vector<const Value*>>&
   values_share_same_storage() const {
     return value_to_same_storage_values_;
   }
 
-  const std::unordered_set<const Value*>& external_values() const {
+  const FastSet<const Value*>& external_values() const {
     return external_values_;
   }
 
@@ -178,14 +195,14 @@ class TORCH_API StaticModule {
   // a vector of ssa_defs corresponding to graph->outputs()
   std::vector<DefInfo> output_ssa_defs_;
   // map a node idx (in graph order) to a vector of ssa_defs for node inputs
-  std::unordered_map<int, std::vector<DefInfo>> node_inputs_ssa_def_map_;
+  FastMap<int, std::vector<DefInfo>> node_inputs_ssa_def_map_;
 
   // Bookkeeping for MemoryPlanner in StaticRuntime
   // values whose live-time exceeds that of running one inference (e.g., input,
   // output, prim::Constants, and their aliases)
-  std::unordered_set<const Value*> external_values_;
+  FastSet<const Value*> external_values_;
   // map a value to the set of values that may share the same storage with it
-  std::unordered_map<const Value*, std::vector<const Value*>>
+  FastMap<const Value*, std::vector<const Value*>>
       value_to_same_storage_values_;
 };
 
@@ -323,8 +340,8 @@ class MemoryPlanner {
  public:
   explicit MemoryPlanner(
       StaticRuntime* runtime,
-      const std::unordered_map<const Value*, std::vector<const Value*>>&,
-      const std::unordered_set<const Value*>& external_values,
+      const FastMap<const Value*, std::vector<const Value*>>&,
+      const FastSet<const Value*>& external_values,
       bool enable_out_variant,
       bool manage_graph_output_memory);
   // disable copying and moving
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 484c4b0..54c0456 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -16,6 +16,7 @@
 #include <ATen/native/quantized/cpu/qembeddingbag.h>
 #include <c10/util/irange.h>
 #include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/runtime/static/impl.h>
 #include <torch/csrc/jit/runtime/static/te_wrapper.h>
 #include <torch/csrc/jit/runtime/vararg_functions.h>
 #include <torch/csrc/jit/tensorexpr/ir.h>
@@ -288,7 +289,7 @@ bool disableUnsafeMathOp(const char* op_name) {
   // not guarantee bit exactness vs the jit interpreter. Note aten::relu is not
   // included even though it uses NNC because the results of relu should always
   // match.
-  static const std::unordered_set<std::string> fast_ops{
+  static const FastSet<std::string> fast_ops{
       "aten::add", "aten::tanh", "aten::sigmoid", "aten::logit"};
   return fast_ops.count(op_name) > 0;
 }
diff --git a/torch/csrc/jit/runtime/static/te_wrapper.cpp b/torch/csrc/jit/runtime/static/te_wrapper.cpp
index d8b494c..acd1fb7 100644
--- a/torch/csrc/jit/runtime/static/te_wrapper.cpp
+++ b/torch/csrc/jit/runtime/static/te_wrapper.cpp
@@ -2,6 +2,7 @@
 
 #include <ATen/CPUFunctions.h>
 #include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/runtime/static/impl.h>
 
 namespace torch {
 namespace jit {
@@ -79,8 +80,8 @@ std::mutex& getNNCCacheMutex() {
   return nncCacheMutex;
 }
 
-std::unordered_map<NodeKind, std::shared_ptr<TEWrapper>>& getNNCCache() {
-  static std::unordered_map<NodeKind, std::shared_ptr<TEWrapper>> nncCache;
+FastMap<NodeKind, std::shared_ptr<TEWrapper>>& getNNCCache() {
+  static FastMap<NodeKind, std::shared_ptr<TEWrapper>> nncCache;
   return nncCache;
 }
 
-- 
2.7.4