From bfdab3c2c5cf449f4ec3dcbec005ca2036d9bdca Mon Sep 17 00:00:00 2001
From: Dilshodzhon Umronkhonovich Poshshoev/AI Tools Lab/SRR/Engineer/Samsung Electronics
Date: Tue, 16 Jul 2019 19:23:04 +0900
Subject: [PATCH] [neurun] Schedule ops in holes/gaps of a backend (#5557)

So far the scheduler assumed that a backend is occupied entirely and that
there are no gaps/holes between two tasks. This change adds support for
scheduling operations into such gaps. It also takes into account the
occupation of the cpu backend by permutation (data transfer) operations.

Signed-off-by: Poshshoev Dilshodzhon
---
 runtimes/neurun/core/include/compiler/Scheduler.h | 15 ++++-
 .../neurun/core/src/compiler/BackendResolver.h    | 12 ++--
 runtimes/neurun/core/src/compiler/Scheduler.cc    | 74 ++++++++++++++++------
 3 files changed, 76 insertions(+), 25 deletions(-)

diff --git a/runtimes/neurun/core/include/compiler/Scheduler.h b/runtimes/neurun/core/include/compiler/Scheduler.h
index 3adc5ca..42ab3b3 100644
--- a/runtimes/neurun/core/include/compiler/Scheduler.h
+++ b/runtimes/neurun/core/include/compiler/Scheduler.h
@@ -70,6 +70,18 @@ public:
 private:
   void makeRank(const graph::Graph &model);
 
+  /**
+   * @brief Returns the time when the backend is available for at least the given amount of time.
+   *
+   * @note Returns either a hole/gap between two already scheduled operations,
+   *       or the finishing time of the last scheduled operation
+   *
+   * @param[in] backend backend for which to return the time
+   * @param[in] starting_time time from which to look for a gap
+   * @param[in] time_amount amount of time for which to look for a gap
+   */
+  int64_t backendAvailableTime(const backend::Backend *backend, const int64_t &starting_time,
+                               const int64_t &time_amount);
   int64_t getTime(const backend::Backend *backend, const std::string &operation, bool quant,
                   uint32_t size);
 
@@ -80,7 +92,8 @@ private:
 private:
   std::unordered_map<const backend::Backend *, std::unordered_map<std::string, int64_t>>
       _run_cache;
-  std::unordered_map<const backend::Backend *, int64_t> _backends_avail_time;
+  // Finishing and starting time of each backend
+  std::unordered_map<const backend::Backend *, std::map<int64_t, int64_t>> _backends_avail_time;
   std::unordered_map<model::OperationIndex, int64_t> _parents_eft;
   std::multimap> _ranks;
   std::shared_ptr _indexed_ranks;
diff --git a/runtimes/neurun/core/src/compiler/BackendResolver.h b/runtimes/neurun/core/src/compiler/BackendResolver.h
index 392a204..70f0c7a 100644
--- a/runtimes/neurun/core/src/compiler/BackendResolver.h
+++ b/runtimes/neurun/core/src/compiler/BackendResolver.h
@@ -39,14 +39,16 @@ public:
   }
 
 public:
-  const backend::Backend *getBackend(const std::type_index &type)
+  const backend::Backend *getDefaultBackend() const { return _backend_manager->get("cpu"); }
+
+  const backend::Backend *getBackend(const std::string &id) const
   {
-    return _gen_map_deprecated[type];
+    return _backend_manager->get(id);
   }
-  const backend::Backend *getDefaultBackend() const
+
+  const backend::Backend *getBackend(const std::type_index &type)
   {
-    backend::Backend *default_backend = _backend_manager->get("cpu");
-    return default_backend;
+    return _gen_map_deprecated[type];
   }
 
   const backend::Backend *getBackend(const model::OperationIndex &index)
diff --git a/runtimes/neurun/core/src/compiler/Scheduler.cc b/runtimes/neurun/core/src/compiler/Scheduler.cc
index 91efc29..e4700d9 100644
--- a/runtimes/neurun/core/src/compiler/Scheduler.cc
+++ b/runtimes/neurun/core/src/compiler/Scheduler.cc
@@ -43,7 +43,7 @@ std::unique_ptr Scheduler::schedule(const graph::Grap
   const auto all_backends = _backend_resolver->getAllBackends();
   for (const auto *backend : all_backends)
   {
-    _backends_avail_time.emplace(backend, 0);
+    _backends_avail_time.emplace(backend, std::map<int64_t, int64_t>{{0, 0}});
   }
 
   // for each task select the backend with the smallest earliest finishing time(eft)
@@ -154,8 +154,9 @@ void Scheduler::makeRank(const graph::Graph &model)
       {
         // Permute layer is used for data transfer and it is implemented on cpu backend
         assert(_backend_resolver->getDefaultBackend()->config()->id() == "cpu");
-        transfer_cost = 100; // makes the scheduler prefer keeping computations on one backend
-        _run_cache[backend][other_backend->config()->id()] = 100;
+        // makes the scheduler prefer keeping computations on one backend
+        transfer_cost = operand.info().total_size() / 100;
+        _run_cache[backend][other_backend->config()->id()] = transfer_cost;
       }
       avg_transfer_cost += transfer_cost;
     }
@@ -230,11 +231,28 @@ void Scheduler::makeRank(const graph::Graph &model)
   VERBOSE(Scheduler::makeRank) << "task prioritizing finished" << std::endl;
 }
 
+int64_t Scheduler::backendAvailableTime(const backend::Backend *backend,
+                                        const int64_t &starting_time, const int64_t &time_amount)
+{
+  auto backend_times = _backends_avail_time.at(backend);
+  // finishing and starting times of the op that will come after the current op
+  auto next_op_fst = backend_times.upper_bound(starting_time);
+  // finishing time of the op that will come before the current op
+  auto prev_op_ft = starting_time;
+  // advance until a hole/gap big enough to run this op is reached
+  while (next_op_fst != backend_times.end() && next_op_fst->second - prev_op_ft < time_amount)
+  {
+    prev_op_ft = next_op_fst->first;
+    ++next_op_fst;
+  }
+  return prev_op_ft;
+}
+
 void Scheduler::scheduleNode(const graph::Graph &model, const model::OperationIndex &index)
 {
   VERBOSE(Scheduler::scheduleNode) << "scheduling (" << index.value() << ")" << std::endl;
   const auto all_backends = _backend_resolver->getAllBackends();
-  int64_t eft = std::numeric_limits<int64_t>::max();
+  int64_t eft = std::numeric_limits<int64_t>::max(), selected_exec_time = 0;
   const auto &node = model.operations().at(index);
   const auto &obj = model.operands().at(node.getInputs().at(0));
   bool quant = false;
@@ -245,11 +263,13 @@ void Scheduler::scheduleNode(const graph::Graph &model, const model::OperationIn
   }
 
   auto size = getOperationsFlattenedIOSize(model, node);
-
+  std::vector<std::pair<int64_t, int64_t>> selected_permute_avail_time;
   // select the backend with the smallest eft of this task
   const backend::Backend *chosen_backend = nullptr;
   for (const auto *backend : all_backends)
   {
+    std::vector<std::pair<int64_t, int64_t>> permute_avail_time;
+
    /*
      TODO: this is workaround, come up with better solution if have.
      Adding exception in stage doesn't help. Because if there is a record for add without
      broadcast, scheduling will select it since it doesn't distinguish broadcast and
@@ -286,12 +306,9 @@ void Scheduler::scheduleNode(const graph::Graph &model, const model::OperationIn
    }
    // get average exec time of the op on this backend
    auto exec_time = getTime(backend, node.getName(), quant, size);
-    VERBOSE(Scheduler::scheduleNode) << "expected exec_time of (" << index.value() << ") "
-                                     << node.getName() << " quant==" << quant << " on "
-                                     << backend->config()->id() << " is " << exec_time
-                                     << " microseconds" << std::endl;
    // get max eft of direct (one lavel above) predecessors
    int64_t max_pred_eft = 0;
+    int64_t total_transfer_cost = 0;
    for (const auto &input : node.getInputs())
    {
      const auto &operand = model.operands().at(input);
@@ -301,34 +318,53 @@ void Scheduler::scheduleNode(const graph::Graph &model, const model::OperationIn
        // Data transfer cost from this parent's backend to current node's backend:
        auto parent_backend = _backend_resolver->getBackend(defs);
 
-        int64_t transfer_cost;
        if (parent_backend == backend)
        {
-          transfer_cost = 1;
+          max_pred_eft = std::max(max_pred_eft, _parents_eft.at(defs));
        }
        else
        {
-          transfer_cost =
+          int64_t transfer_cost =
              getTime(parent_backend, backend->config()->id(), quant, obj.info().total_size());
+          total_transfer_cost += transfer_cost;
+
+          // Assumes that cpu is free and proceeds to permutation right away
+          permute_avail_time.push_back({transfer_cost, _parents_eft.at(defs)});
+          max_pred_eft = std::max(max_pred_eft, _parents_eft.at(defs));
        }
-        max_pred_eft = std::max(max_pred_eft, _parents_eft.at(defs) + transfer_cost);
      }
    }
-    auto backend_avail_time = _backends_avail_time[backend];
-    // earliest starting time
-    auto est = std::max(backend_avail_time, max_pred_eft);
-    if (eft > est + exec_time)
+    max_pred_eft += total_transfer_cost;
+    // find the hole/gap, where this op can be put or the finishing time of the last assigned op
+    auto prev_op_ft = backendAvailableTime(backend, max_pred_eft, exec_time);
+
+    VERBOSE(Scheduler::scheduleNode)
+        << "expected exec_time of (" << index.value() << ") " << node.getName()
+        << " quant==" << quant << " on " << backend->config()->id() << " is " << exec_time
+        << " microseconds. Backend available time: " << prev_op_ft
+        << " Parent's max eft: " << max_pred_eft - total_transfer_cost
+        << " total transfer cost: " << total_transfer_cost << std::endl;
+    if (eft > prev_op_ft + exec_time)
    {
-      eft = est + exec_time;
+      eft = prev_op_ft + exec_time;
+      selected_exec_time = exec_time;
      chosen_backend = backend;
+      selected_permute_avail_time = permute_avail_time;
    }
  }
 
+  for (const auto &it : selected_permute_avail_time)
+  {
+    auto prev_op_ft =
+        backendAvailableTime(_backend_resolver->getBackend("cpu"), it.first, it.second);
+    _backends_avail_time[_backend_resolver->getBackend("cpu")].insert(
+        {prev_op_ft + it.second, prev_op_ft});
+  }
  VERBOSE(Scheduler::schedule) << "backend for " << node.getName() << " is "
                               << chosen_backend->config()->id() << ". Its eft: " << eft
                               << std::endl;
  _parents_eft[index] = eft;
-  _backends_avail_time[chosen_backend] = eft + 1;
+  _backends_avail_time[chosen_backend].insert({eft, eft - selected_exec_time});
  _backend_resolver->setBackend(index, chosen_backend);
}
-- 
2.7.4
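
For reference, the gap search that backendAvailableTime() implements can be sketched in
isolation as follows: each backend's occupation is kept as a std::map from finishing time to
starting time of every scheduled op, upper_bound() finds the first op that finishes after the
requested starting time, and the loop skips forward until the hole before the next op is wide
enough or the end of the timeline is reached. This is a minimal standalone sketch, not neurun
code; Timeline, availableTime and reserve are illustrative names only.

// Minimal standalone sketch of the gap search used by backendAvailableTime().
// A backend's occupation is a std::map keyed by finishing time, with the
// starting time as the mapped value, like one entry of _backends_avail_time.
#include <cstdint>
#include <iostream>
#include <map>

using Timeline = std::map<int64_t, int64_t>; // finishing time -> starting time

// Returns the earliest time >= starting_time at which a free slot of at least
// time_amount exists; if no such hole lies between scheduled ops, returns the
// finishing time of the last examined op (i.e. append at the end).
int64_t availableTime(const Timeline &timeline, int64_t starting_time, int64_t time_amount)
{
  auto next_op = timeline.upper_bound(starting_time); // first op finishing after starting_time
  int64_t prev_finish = starting_time;
  while (next_op != timeline.end() && next_op->second - prev_finish < time_amount)
  {
    // The hole before the next op is too small: continue after that op finishes.
    prev_finish = next_op->first;
    ++next_op;
  }
  return prev_finish;
}

// Reserves [start, start + duration), mirroring
// _backends_avail_time[chosen_backend].insert({eft, eft - selected_exec_time}).
void reserve(Timeline &timeline, int64_t start, int64_t duration)
{
  timeline.insert({start + duration, start});
}

int main()
{
  Timeline cpu{{0, 0}}; // sentinel entry, as emplaced for every backend
  reserve(cpu, 0, 10);  // op A occupies [0, 10)
  reserve(cpu, 25, 5);  // op B occupies [25, 30)

  // A 12-us op ready at t=2 fits into the [10, 25) hole, so it starts at 10.
  std::cout << availableTime(cpu, 2, 12) << std::endl; // prints 10
  // A 20-us op fits into no hole, so it is appended after op B at t=30.
  std::cout << availableTime(cpu, 2, 20) << std::endl; // prints 30
  return 0;
}

Keying the map by finishing time is what lets upper_bound() jump straight to the first op that
could conflict with the requested starting time in O(log n), which is why the patch stores a
{finish, start} pair per scheduled op instead of a single availability timestamp per backend.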