From 8a75031d6efa7d796f3e4628624ac09af9ab5e48 Mon Sep 17 00:00:00 2001
From: Dilshodzhon Poshshoev/AI Tools Lab/SRR/Engineer/Samsung Electronics
Date: Fri, 4 Oct 2019 10:01:51 +0300
Subject: [PATCH] [neurun] Schedule branch by branch instead of node (#6965)

* [neurun] Schedule branch by branch instead of node

Related issue: 6963
Schedule a whole branch (which can be seen as a subgraph) recursively
before moving to the next node

Signed-off-by: Poshshoev Dilshodzhon

* Fix CI failure

Signed-off-by: Poshshoev Dilshodzhon

* Address offline review

Signed-off-by: Poshshoev Dilshodzhon

* Fix review feedback

Signed-off-by: Poshshoev Dilshodzhon

* Address feedback: rename scheduleRecursively

Signed-off-by: Poshshoev Dilshodzhon

* Fix documentation typo

Signed-off-by: Poshshoev Dilshodzhon
---
 runtimes/neurun/core/src/compiler/HEScheduler.cc | 52 +++++++++++++++++++++++-
 runtimes/neurun/core/src/compiler/HEScheduler.h  | 15 ++++++-
 runtimes/neurun/test/core/compiler/Scheduler.cc  | 14 +++----
 3 files changed, 70 insertions(+), 11 deletions(-)

diff --git a/runtimes/neurun/core/src/compiler/HEScheduler.cc b/runtimes/neurun/core/src/compiler/HEScheduler.cc
index bbcb4d6..c2dec06 100644
--- a/runtimes/neurun/core/src/compiler/HEScheduler.cc
+++ b/runtimes/neurun/core/src/compiler/HEScheduler.cc
@@ -185,6 +185,43 @@ bool HEScheduler::isNodeProfiled(const model::Operation &node)
   return true;
 }
 
+void HEScheduler::scheduleBranch(const model::OperationIndex &index,
+                                 model::OperationIndexMap<bool> &scheduled)
+{
+  auto loc_index = index;
+  const backend::Backend *parent_backend = nullptr;
+  while (true)
+  {
+    if (scheduled[loc_index])
+    {
+      return;
+    }
+    if (!scheduleNode(loc_index, parent_backend))
+    {
+      return;
+    }
+    scheduled[loc_index] = true;
+    parent_backend = _backend_resolver->getBackend(loc_index);
+
+    const auto &node = _graph->operations().at(loc_index);
+    model::OperandIndex tmp;
+    /* get the only output operand, which is the input of the next single operation
+     * and only this node's output */
+    if (node.getOutputs().size() != 1)
+    {
+      return;
+    }
+    const auto &only_out_operand = _graph->operands().at(*node.getOutputs().begin());
+    loc_index = only_out_operand.getUses().list().front();
+    /* verify that the next node is neither the beginning nor the ending node of a branch */
+    const auto &next_node = _graph->operations().at(loc_index);
+    if (!isMergable(*_graph, next_node))
+    {
+      return;
+    }
+  }
+}
+
 std::unique_ptr<compiler::BackendResolver> HEScheduler::schedule(const graph::Graph &graph)
 {
   _graph = &graph;
@@ -217,10 +254,14 @@ std::unique_ptr<compiler::BackendResolver> HEScheduler::schedule(const graph::Gr
     }
   }
 
+  model::OperationIndexMap<bool> visited;
+  graph.operations().iterate([&](const model::OperationIndex &index, const model::Operation &) {
+    visited[index] = false;
+  });
   // for each task select the backend with the smallest earliest finishing time(eft)
   for (const auto &rank : _rank_to_op)
   {
-    scheduleNode(rank.second);
+    scheduleBranch(rank.second, visited);
   }
   VERBOSE(HEScheduler::schedule) << "task scheduling finished" << std::endl;
   return std::move(_backend_resolver);
@@ -416,7 +457,8 @@ int64_t HEScheduler::backendAvailableTime(const backend::Backend *backend,
   return prev_op_ft;
 }
 
-void HEScheduler::scheduleNode(const model::OperationIndex &index)
+bool HEScheduler::scheduleNode(const model::OperationIndex &index,
+                               const backend::Backend *parent_backend)
 {
   VERBOSE(HEScheduler::scheduleNode) << "scheduling (" << index.value() << ")" << std::endl;
   int64_t eft = std::numeric_limits<int64_t>::max(), selected_exec_time = 0;
@@ -444,6 +486,11 @@ void HEScheduler::scheduleNode(const model::OperationIndex &index)
     throw std::runtime_error{"Fail to choose backend on scheduler"};
   }
 
+  // this node is part of a branch, but a backend other than the parent's was chosen for it
+  if (parent_backend && parent_backend != chosen_backend)
+  {
+    return false;
+  }
   for (const auto &it : selected_transfer_st_exec_time)
   {
     auto prev_op_ft = backendAvailableTime(_cpu_backend, it.first, it.second);
@@ -457,6 +504,7 @@ void HEScheduler::scheduleNode(const model::OperationIndex &index)
   VERBOSE(HEScheduler::scheduleNode) << "backend for " << node.getName() << " is "
                                      << chosen_backend->config()->id() << ". Its eft: " << eft
                                      << std::endl;
+  return true;
 }
 
 std::pair<int64_t, int64_t>
diff --git a/runtimes/neurun/core/src/compiler/HEScheduler.h b/runtimes/neurun/core/src/compiler/HEScheduler.h
index ba4ebac..1657482 100644
--- a/runtimes/neurun/core/src/compiler/HEScheduler.h
+++ b/runtimes/neurun/core/src/compiler/HEScheduler.h
@@ -81,7 +81,7 @@ public:
 
 private:
   bool isNodeProfiled(const model::Operation &);
-  void scheduleNode(const model::OperationIndex &);
+  bool scheduleNode(const model::OperationIndex &, const backend::Backend *parent_backend);
   /**
    * @brief Get earliest starting time and execution time of an operation on a backend.
    *
@@ -139,6 +139,19 @@ private:
 
   int64_t tryBackend(const model::Operation &node, const backend::Backend *backend);
 
+  /**
+   * @brief Schedule a node and its successors until:
+   *            1. there is no branching or connection of multiple branches
+   *            2. for a subsequent node, a backend other than the predecessor's is preferred
+   *
+   * @param[in] index: index of an operation
+   * @param[in] scheduled: a map to check if this node has already been scheduled
+   *
+   * @return N/A
+   */
+  void scheduleBranch(const model::OperationIndex &index,
+                      model::OperationIndexMap<bool> &scheduled);
+
 private:
   // This variable stores backend/node pairs with unknown execution time, and hints scheduler
   // whether it should assign these backends to these nodes:
diff --git a/runtimes/neurun/test/core/compiler/Scheduler.cc b/runtimes/neurun/test/core/compiler/Scheduler.cc
index cf82261..2f00629 100644
--- a/runtimes/neurun/test/core/compiler/Scheduler.cc
+++ b/runtimes/neurun/test/core/compiler/Scheduler.cc
@@ -453,14 +453,12 @@ TEST_P(SchedulerTestWithExecutorParam, branched_graph_known_exec_time)
 {
   // Increase execution time for GPU backend
   ExecTime et(_mock_backends);
-  /* for parallel executor: set a time, that is larger than branches_cnt*npu_exec_time
-     so that npu is prefered: the i+1 level node of the first branch will wait for npu
-     until it finishes the i-th nodes of all other branches in BFS order*/
-  setOperationExecTime(et, _gpu_backend, "Mul", false, OPERATION_SIZE, NPU_ET * 2 + 1);
-  /* for parallel executor: set ET of FC larger than Mul's to be determinant:
-     if they are equal and scheduling is done in order mul1->FC1->FC2->mul2,
-     then for mul2 gpu is selected since NPU_ET*3 > GPU_ET(which is NPU_ET * 2 + 1)*/
-  setOperationExecTime(et, _gpu_backend, "FullyConnected", false, OPERATION_SIZE, NPU_ET * 2 + 2);
+  /* for parallel executor: set a time that is larger than sum_of_other_branches_nodes_cnt *
+   * npu_exec_time so that npu is preferred: the i-th branch will wait for npu until it finishes
+   * the nodes of branches [0;i-1] in DFS order. In each branch it goes deep until it encounters
+   * branching or the scheduler assigns another backend to a node */
+  setOperationExecTime(et, _gpu_backend, "Mul", false, OPERATION_SIZE, NPU_ET * 3 + 1);
+  setOperationExecTime(et, _gpu_backend, "FullyConnected", false, OPERATION_SIZE, NPU_ET * 3 + 1);
   et.uploadOperationsExecTime();
 
   // Test scheduler
-- 
2.7.4