[neurun] Schedule ops in holes/gaps of a backends (#5557)

author Дилшоджон Умронхонович Пошшоев/AI Tools Lab /SRR/Engineer/삼성전자 <d.poshshoev@samsung.com>

Tue, 16 Jul 2019 10:23:04 +0000 (19:23 +0900)

committer 이한종/On-Device Lab(SR)/Engineer/삼성전자 <hanjoung.lee@samsung.com>

Tue, 16 Jul 2019 10:23:04 +0000 (19:23 +0900)
author Дилшоджон Умронхонович Пошшоев/AI Tools Lab /SRR/Engineer/삼성전자 <d.poshshoev@samsung.com>
Tue, 16 Jul 2019 10:23:04 +0000 (19:23 +0900)
committer 이한종/On-Device Lab(SR)/Engineer/삼성전자 <hanjoung.lee@samsung.com>
Tue, 16 Jul 2019 10:23:04 +0000 (19:23 +0900)
diff --git a/runtimes/neurun/core/include/compiler/Scheduler.h b/runtimes/neurun/core/include/compiler/Scheduler.h

index 3adc5ca..42ab3b3 100644 (file)
--- a/runtimes/neurun/core/include/compiler/Scheduler.h
+++ b/runtimes/neurun/core/include/compiler/Scheduler.h
@@ -70,6 +70,18 @@ public:
  
  private:
    void makeRank(const graph::Graph &model);
+  /**
+   * @brief   Returns the time, when backend is available for at least given amount of time.
+   *
+   * @note  Returns either hole/gap between two performing two already scheduled operations,
+   *        or the finishing time of the last scheduled operation
+   *
+   * @param[in] backend backend, for which to return the time
+   * @param[in] starting_time time, starting which to look for gap
+   * @param[in] time_amount amound of the time, for which to look gap
+   */
+  int64_t backendAvailableTime(const backend::Backend *backend, const int64_t &starting_time,
+                               const int64_t &time_amount);
  
    int64_t getTime(const backend::Backend *backend, const std::string &operation, bool quant,
                    uint32_t size);
@@ -80,7 +92,8 @@ private:
  
  private:
    std::unordered_map<const backend::Backend *, std::unordered_map<std::string, int>> _run_cache;
-  std::unordered_map<const backend::Backend *, int64_t> _backends_avail_time;
+  // Finishing and starting time of each backend
+  std::unordered_map<const backend::Backend *, std::map<int64_t, int64_t>> _backends_avail_time;
    std::unordered_map<model::OperationIndex, int64_t> _parents_eft;
    std::multimap<int64_t, model::OperationIndex, std::greater<int64_t>> _ranks;
    std::shared_ptr<OpRanks> _indexed_ranks;
diff --git a/runtimes/neurun/core/src/compiler/BackendResolver.h b/runtimes/neurun/core/src/compiler/BackendResolver.h

index 392a204..70f0c7a 100644 (file)
--- a/runtimes/neurun/core/src/compiler/BackendResolver.h
+++ b/runtimes/neurun/core/src/compiler/BackendResolver.h
@@ -39,14 +39,16 @@ public:
    }
  
  public:
-  const backend::Backend *getBackend(const std::type_index &type)
+  const backend::Backend *getDefaultBackend() const { return _backend_manager->get("cpu"); }
+
+  const backend::Backend *getBackend(const std::string &id) const
    {
-    return _gen_map_deprecated[type];
+    return _backend_manager->get(id);
    }
-  const backend::Backend *getDefaultBackend() const
+
+  const backend::Backend *getBackend(const std::type_index &type)
    {
-    backend::Backend *default_backend = _backend_manager->get("cpu");
-    return default_backend;
+    return _gen_map_deprecated[type];
    }
  
    const backend::Backend *getBackend(const model::OperationIndex &index)
diff --git a/runtimes/neurun/core/src/compiler/Scheduler.cc b/runtimes/neurun/core/src/compiler/Scheduler.cc

index 91efc29..e4700d9 100644 (file)
--- a/runtimes/neurun/core/src/compiler/Scheduler.cc
+++ b/runtimes/neurun/core/src/compiler/Scheduler.cc
@@ -43,7 +43,7 @@ std::unique_ptr<compiler::BackendResolver> Scheduler::schedule(const graph::Grap
    const auto all_backends = _backend_resolver->getAllBackends();
    for (const auto *backend : all_backends)
    {
-    _backends_avail_time.emplace(backend, 0);
+    _backends_avail_time.emplace(backend, std::map<int64_t, int64_t>{{0, 0}});
    }
  
    // for each task select the backend with the smallest earliest finishing time(eft)
@@ -154,8 +154,9 @@ void Scheduler::makeRank(const graph::Graph &model)
            {
              // Permute layer is used for data transfer and it is implemented on cpu backend
              assert(_backend_resolver->getDefaultBackend()->config()->id() == "cpu");
-            transfer_cost = 100; // makes the scheduler prefer keeping computations on one backend
-            _run_cache[backend][other_backend->config()->id()] = 100;
+            // makes the scheduler prefer keeping computations on one backend
+            transfer_cost = operand.info().total_size() / 100;
+            _run_cache[backend][other_backend->config()->id()] = transfer_cost;
            }
            avg_transfer_cost += transfer_cost;
          }
@@ -230,11 +231,28 @@ void Scheduler::makeRank(const graph::Graph &model)
    VERBOSE(Scheduler::makeRank) << "task prioritizing finished" << std::endl;
  }
  
+int64_t Scheduler::backendAvailableTime(const backend::Backend *backend,
+                                        const int64_t &starting_time, const int64_t &time_amount)
+{
+  auto backend_times = _backends_avail_time.at(backend);
+  // finishing and starting times of an op, that will come after current op
+  auto next_op_fst = backend_times.upper_bound(starting_time);
+  // finishing time of an op, that will come before current op
+  auto prev_op_ft = starting_time;
+  // until reach the "hole/gap", that is enough to run this op
+  while (next_op_fst != backend_times.end() && next_op_fst->second - prev_op_ft < time_amount)
+  {
+    prev_op_ft = next_op_fst->first;
+    ++next_op_fst;
+  }
+  return prev_op_ft;
+}
+
  void Scheduler::scheduleNode(const graph::Graph &model, const model::OperationIndex &index)
  {
    VERBOSE(Scheduler::scheduleNode) << "scheduling (" << index.value() << ")" << std::endl;
    const auto all_backends = _backend_resolver->getAllBackends();
-  int64_t eft = std::numeric_limits<int64_t>::max();
+  int64_t eft = std::numeric_limits<int64_t>::max(), selected_exec_time = 0;
    const auto &node = model.operations().at(index);
    const auto &obj = model.operands().at(node.getInputs().at(0));
    bool quant = false;
@@ -245,11 +263,13 @@ void Scheduler::scheduleNode(const graph::Graph &model, const model::OperationIn
    }
  
    auto size = getOperationsFlattenedIOSize(model, node);
-
+  std::vector<std::pair<int64_t, int64_t>> selected_permute_avail_time;
    // select the backend with the smallest eft of this task
    const backend::Backend *chosen_backend = nullptr;
    for (const auto *backend : all_backends)
    {
+    std::vector<std::pair<int64_t, int64_t>> permute_avail_time;
+
      /* TODO: this is workaround, come up with better solution if have.
               Adding exception in stage doesn't help. Because if there is a record for add without
               broadcast, scheduling will select it since it doesn't distinguish broadcast and
@@ -286,12 +306,9 @@ void Scheduler::scheduleNode(const graph::Graph &model, const model::OperationIn
      }
      // get average exec time of the op on this backend
      auto exec_time = getTime(backend, node.getName(), quant, size);
-    VERBOSE(Scheduler::scheduleNode) << "expected exec_time of (" << index.value() << ") "
-                                     << node.getName() << " quant==" << quant << " on "
-                                     << backend->config()->id() << " is " << exec_time
-                                     << " microseconds" << std::endl;
      // get max eft of direct (one lavel above) predecessors
      int64_t max_pred_eft = 0;
+    int64_t total_transfer_cost = 0;
      for (const auto &input : node.getInputs())
      {
        const auto &operand = model.operands().at(input);
@@ -301,34 +318,53 @@ void Scheduler::scheduleNode(const graph::Graph &model, const model::OperationIn
          // Data transfer cost from this parent's backend to current node's backend:
          auto parent_backend = _backend_resolver->getBackend(defs);
  
-        int64_t transfer_cost;
          if (parent_backend == backend)
          {
-          transfer_cost = 1;
+          max_pred_eft = std::max(max_pred_eft, _parents_eft.at(defs));
          }
          else
          {
-          transfer_cost =
+          int64_t transfer_cost =
                getTime(parent_backend, backend->config()->id(), quant, obj.info().total_size());
+          total_transfer_cost += transfer_cost;
+
+          // Assumes that cpu is free and proceeds to permutation right away
+          permute_avail_time.push_back({transfer_cost, _parents_eft.at(defs)});
+          max_pred_eft = std::max(max_pred_eft, _parents_eft.at(defs));
          }
-        max_pred_eft = std::max(max_pred_eft, _parents_eft.at(defs) + transfer_cost);
        }
      }
-    auto backend_avail_time = _backends_avail_time[backend];
-    // earliest starting time
-    auto est = std::max(backend_avail_time, max_pred_eft);
-    if (eft > est + exec_time)
+    max_pred_eft += total_transfer_cost;
+    // find the hole/gap, where this op can be put or the finishing time of the last assigned op
+    auto prev_op_ft = backendAvailableTime(backend, max_pred_eft, exec_time);
+
+    VERBOSE(Scheduler::scheduleNode)
+        << "expected exec_time of (" << index.value() << ") " << node.getName()
+        << " quant==" << quant << " on " << backend->config()->id() << " is " << exec_time
+        << " microseconds. Backend available time: " << prev_op_ft
+        << " Parent's max eft: " << max_pred_eft - total_transfer_cost
+        << " total transfer cost: " << total_transfer_cost << std::endl;
+    if (eft > prev_op_ft + exec_time)
      {
-      eft = est + exec_time;
+      eft = prev_op_ft + exec_time;
+      selected_exec_time = exec_time;
        chosen_backend = backend;
+      selected_permute_avail_time = permute_avail_time;
      }
    }
+  for (const auto &it : selected_permute_avail_time)
+  {
+    auto prev_op_ft =
+        backendAvailableTime(_backend_resolver->getBackend("cpu"), it.first, it.second);
+    _backends_avail_time[_backend_resolver->getBackend("cpu")].insert(
+        {prev_op_ft + it.second, prev_op_ft});
+  }
    VERBOSE(Scheduler::schedule) << "backend for " << node.getName() << " is "
                                 << chosen_backend->config()->id() << ". Its eft: " << eft
                                 << std::endl;
  
    _parents_eft[index] = eft;
-  _backends_avail_time[chosen_backend] = eft + 1;
+  _backends_avail_time[chosen_backend].insert({eft, eft - selected_exec_time});
    _backend_resolver->setBackend(index, chosen_backend);
  }
author	Дилшоджон Умронхонович Пошшоев/AI Tools Lab /SRR/Engineer/삼성전자 <d.poshshoev@samsung.com>
	Tue, 16 Jul 2019 10:23:04 +0000 (19:23 +0900)
committer	이한종/On-Device Lab(SR)/Engineer/삼성전자 <hanjoung.lee@samsung.com>
	Tue, 16 Jul 2019 10:23:04 +0000 (19:23 +0900)
runtimes/neurun/core/include/compiler/Scheduler.h		patch \| blob \| history
runtimes/neurun/core/src/compiler/BackendResolver.h		patch \| blob \| history
runtimes/neurun/core/src/compiler/Scheduler.cc		patch \| blob \| history