From 8e047d5fb38afd05770f97b6155c881ac44d74fa Mon Sep 17 00:00:00 2001
From: =?utf8?q?Ivan=20Vagin/AI=20Tools=20Lab=20/SRR/Engineer/=EC=82=BC?=
 =?utf8?q?=EC=84=B1=EC=A0=84=EC=9E=90?=
Date: Fri, 30 Aug 2019 11:17:48 +0900
Subject: [PATCH] [neurun] Implemented scheduler test with branched graph
 (#6926)

* [neurun] Implemented scheduler test with branched graph

* Implemented scheduler test with branched graph and known execution time

* Added set time methods to ExecTime

* Refactored existing scheduler test a little bit

Signed-off-by: Ivan Vagin

* 'set' methods from ExecutionTime replaced with helper functions

Signed-off-by: Ivan Vagin

* remove profile data before test

Signed-off-by: Ivan Vagin

* Review fixes

Signed-off-by: Ivan Vagin

* Increased execution time for GPU in last test

Signed-off-by: Ivan Vagin

* Set correct large exec time for gpu

Signed-off-by: Poshshoev Dilshodzhon
---
 runtimes/neurun/test/core/compiler/Scheduler.cc | 306 ++++++++++++++++++------
 1 file changed, 236 insertions(+), 70 deletions(-)

diff --git a/runtimes/neurun/test/core/compiler/Scheduler.cc b/runtimes/neurun/test/core/compiler/Scheduler.cc
index 5e5cb6c..4a56b08 100644
--- a/runtimes/neurun/test/core/compiler/Scheduler.cc
+++ b/runtimes/neurun/test/core/compiler/Scheduler.cc
@@ -24,6 +24,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 
 
@@ -34,6 +35,7 @@ namespace
 using namespace neurun;
 using namespace model;
 using namespace backend;
+using namespace operation;
 
 //
 // Mock backends classes
@@ -96,6 +98,7 @@ struct MockBackendNPU : public Backend
 
 const int OPERAND_ELEMS = 268203;
 const int OPERAND_SIZE = OPERAND_ELEMS * 4;
+const int OPERATION_SIZE = OPERAND_SIZE * 3;
 
 const std::string LINEAR("Linear");
 const std::string DATAFLOW("Dataflow");
@@ -109,51 +112,27 @@ const std::string PARALLEL("Parallel");
 
 void setExecutor(const std::string &executor) { setenv("EXECUTOR", executor.c_str(), true); }
 
 // Calculate operation size by addition sizes of all input and output operands
-uint32_t calcOpSize(const graph::Graph &graph, const OperationIndex &op_idx)
+uint32_t calcOpSize(const std::shared_ptr<graph::Graph> &graph, const OperationIndex &op_idx)
 {
   uint32_t size = 0;
-  for (const auto &input : graph.operations().at(op_idx).getInputs())
-    size += graph.operands().at(input).info().total_size();
-  for (const auto &output : graph.operations().at(op_idx).getOutputs())
-    size += graph.operands().at(output).info().total_size();
+  for (const auto &input : graph->operations().at(op_idx).getInputs())
+    size += graph->operands().at(input).info().total_size();
+  for (const auto &output : graph->operations().at(op_idx).getOutputs())
+    size += graph->operands().at(output).info().total_size();
   return size;
 }
 
-// Create straight graph: Add->Mul->FullyConnected
-std::unique_ptr<graph::Graph> createStraightGraph()
+// Set execution operation time. This method is needed since ExecTime has only
+// the 'updateOperationExecTime' method.
+void setOperationExecTime(ExecTime &et, const Backend *backend, const std::string &operation,
+                          bool quant, uint32_t op_size, int64_t time)
 {
-  auto graph = nnfw::cpp14::make_unique<graph::Graph>(nnfw::cpp14::make_unique<Model>());
-
-  // Create operands
-  const TypeInfo float_op(DataType::FLOAT32);
-  auto add_lhs_index = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
-  auto add_rhs_index = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
-  auto add_out_index = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
-  auto mul_lhs_index = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
-  auto mul_out_index = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
-  auto fc_weights_index = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
-  auto fc_out_index = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
-
-  // Create AddNode
-  operation::AddNode::Param add_param{Activation::NONE};
-  auto add_op = std::unique_ptr<operation::AddNode>(
-      new operation::AddNode({add_lhs_index, add_rhs_index}, {add_out_index}, add_param));
-  graph->addOperation(std::move(add_op));
-
-  // Create MulNode
-  operation::MulNode::Param mul_param{Activation::NONE};
-  auto mul_op = std::unique_ptr<operation::MulNode>(
-      new operation::MulNode({mul_lhs_index, add_out_index}, {mul_out_index}, mul_param));
-  graph->addOperation(std::move(mul_op));
-
-  // Create FullyConnectedNode
-  operation::FullyConnectedNode::Param fc_param{Activation::NONE};
-  auto fc_op = std::unique_ptr<operation::FullyConnectedNode>(new operation::FullyConnectedNode(
-      {fc_weights_index, mul_out_index}, {fc_out_index}, fc_param));
-  graph->addOperation(std::move(fc_op));
-
-  graph->finishBuilding();
-  return graph;
+  // You shouldn't set a negative time with this method since the nnfw JSON deserializer can't read it
+  assert(time > 0);
+  int64_t prev_time = et.getOperationExecTime(backend, operation, quant, op_size);
+  int64_t time_to_set = prev_time == ExecTime::NOT_FOUND ? time : 2 * time - prev_time;
+  et.updateOperationExecTime(backend, operation, quant, op_size, time_to_set);
+  assert(et.getOperationExecTime(backend, operation, quant, op_size) == time);
 }
 
 // Set same execution time for all given backends
@@ -166,11 +145,24 @@ void setOperationsExecutionTime(const std::vector<const Backend *> &backends,
   for (int i = 0; i < op_names.size(); ++i)
   {
     for (auto &backend : backends)
-      et.updateOperationExecTime(backend, op_names[i], false, op_sizes[i], exec_time);
+      setOperationExecTime(et, backend, op_names[i], false, op_sizes[i], exec_time);
   }
   et.uploadOperationsExecTime();
 }
 
+// Set permute time from one backend to another. This method is needed since ExecTime has only
+// the 'updatePermuteTime' method.
+void setPermutationTime(ExecTime &et, const Backend *from_backend, const Backend *to_backend,
+                        bool quant, uint32_t op_size, int64_t time)
+{
+  // You shouldn't set a negative time with this method since the nnfw JSON deserializer can't read it
+  assert(time > 0);
+  int64_t prev_time = et.getPermuteTime(from_backend, to_backend, quant, op_size);
+  int64_t time_to_set = prev_time == ExecTime::NOT_FOUND ? time : 2 * time - prev_time;
+  et.updatePermuteTime(from_backend, to_backend, quant, op_size, time_to_set);
+  assert(et.getPermuteTime(from_backend, to_backend, quant, op_size) == time);
+}
+
 // Set same permutation time between all given backends
 void setPermutationsExecutionTime(const std::vector<const Backend *> &backends,
                                   const int operand_size, const int64_t exec_time)
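
Why "2 * time - prev_time" forces an exact value: both helpers above assume
that ExecTime's update methods fold a new sample into a running average of the
previous and new values. A minimal sketch of that arithmetic, with the
averaging behaviour as an assumption and 'averaged_update' as a hypothetical
stand-in for the real API:

    #include <cassert>
    #include <cstdint>

    // Assumed behaviour of ExecTime::update*Time when a previous sample exists.
    int64_t averaged_update(int64_t prev_time, int64_t new_time)
    {
      return (prev_time + new_time) / 2;
    }

    int main()
    {
      const int64_t prev = 3000, target = 5000;
      // Feeding 2 * target - prev makes the stored average land exactly on target.
      assert(averaged_update(prev, 2 * target - prev) == target);
      return 0;
    }
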
@@ -182,13 +174,104 @@
     {
       if (backend == other_backend)
         continue;
-      et.updatePermuteTime(backend, other_backend, false, operand_size, exec_time);
+      setPermutationTime(et, backend, other_backend, false, operand_size, exec_time);
     }
   }
   et.uploadOperationsExecTime();
 }
 
 //
+// Functions for creating graphs
+//
+
+using OIS = OperandIndexSequence;
+
+template <typename NodeT, typename... Types>
+model::OperationIndex createNode(std::shared_ptr<graph::Graph> graph, Types &&... args)
+{
+  typename NodeT::Param op_params{Activation::NONE};
+  auto op = nnfw::cpp14::make_unique<NodeT>(std::forward<Types>(args)..., op_params);
+  auto op_idx = graph->addOperation(std::move(op));
+  // For now, all operations in the tested graphs have the same size (for simplicity)
+  assert(calcOpSize(graph, op_idx) == OPERATION_SIZE);
+  return op_idx;
+}
+
+// Create straight graph: Add->Sub->Mul
+std::shared_ptr<graph::Graph> createStraightGraph()
+{
+  auto graph = std::make_shared<graph::Graph>(nnfw::cpp14::make_unique<Model>());
+  const TypeInfo float_op(DataType::FLOAT32);
+
+  // Create add node
+  auto add_lhs_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto add_rhs_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto add_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<AddNode>(graph, OIS{add_lhs_idx, add_rhs_idx}, OIS{add_out_idx});
+
+  // Create sub node
+  auto sub_const_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto sub_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<SubNode>(graph, OIS{add_out_idx, sub_const_idx}, OIS{sub_out_idx});
+
+  // Create mul node
+  auto mul_const_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto mul_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<MulNode>(graph, OIS{sub_out_idx, mul_const_idx}, OIS{mul_out_idx});
+
+  graph->finishBuilding();
+  return graph;
+}
+
+/* Create branched graph:
+ *       [Add]
+ *      //    \\
+ *   [Mul1]    [FC1]
+ *     ||       ||
+ *   [Mul2]    [FC2]
+ *      \\    //
+ *       [Sub]
+ */
+std::shared_ptr<graph::Graph> createBranchedGraph()
+{
+  auto graph = std::make_shared<graph::Graph>(nnfw::cpp14::make_unique<Model>());
+  const TypeInfo float_op(DataType::FLOAT32);
+
+  // Create add node
+  auto add_lhs_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto add_rhs_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto add_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<AddNode>(graph, OIS{add_lhs_idx, add_rhs_idx}, OIS{add_out_idx});
+
+  // Create mul1 node
+  auto mul1_const_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto mul1_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<MulNode>(graph, OIS{add_out_idx, mul1_const_idx}, OIS{mul1_out_idx});
+
+  // Create mul2 node
+  auto mul2_const_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto mul2_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<MulNode>(graph, OIS{mul1_out_idx, mul2_const_idx}, OIS{mul2_out_idx});
+
+  // Create fc1 node
+  auto fc1_const_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto fc1_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<FullyConnectedNode>(graph, OIS{add_out_idx, fc1_const_idx}, OIS{fc1_out_idx});
+
+  // Create fc2 node
+  auto fc2_const_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto fc2_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<FullyConnectedNode>(graph, OIS{fc1_out_idx, fc2_const_idx}, OIS{fc2_out_idx});
+
+  // Create sub node
+  auto sub_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<SubNode>(graph, OIS{mul2_out_idx, fc2_out_idx}, OIS{sub_out_idx});
+
+  graph->finishBuilding();
+  return graph;
+}
+
+//
 // Tests setup/teardown
 //
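
How OPERATION_SIZE relates to the graphs above: every node created through
createNode has exactly two input operands and one output operand, each a
float32 tensor of OPERAND_ELEMS elements, so calcOpSize always sums to three
operand sizes, which is what the assert in createNode checks. A standalone
sketch of the same arithmetic:

    #include <cassert>
    #include <cstdint>

    int main()
    {
      const uint32_t OPERAND_ELEMS = 268203;
      const uint32_t OPERAND_SIZE = OPERAND_ELEMS * 4; // float32 -> 4 bytes per element
      // Two inputs plus one output, all of the same operand size:
      const uint32_t op_size = 2 * OPERAND_SIZE + 1 * OPERAND_SIZE;
      assert(op_size == OPERAND_SIZE * 3); // == OPERATION_SIZE in the test
      return 0;
    }
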
@@ -204,6 +287,9 @@ protected:
     _npu_backend = new MockBackendNPU();
     _mock_backends = {_cpu_backend, _gpu_backend, _npu_backend};
 
+    // Remove previous profile data if it exists
+    remove("exec_time.json");
+
     // Remember original value of 'EXECUTOR' environment variable
     char *executor = std::getenv("EXECUTOR");
     _original_executor = executor == nullptr ? "" : executor;
@@ -226,8 +312,8 @@ protected:
   std::string _original_executor;
 };
 
-class SchedulerTestWithStringParam : public SchedulerTest,
-                                     public testing::WithParamInterface<std::string>
+class SchedulerTestWithExecutorParam : public SchedulerTest,
+                                       public testing::WithParamInterface<std::string>
 {
 };
 
@@ -236,55 +322,135 @@ class SchedulerTestWithStringParam : public SchedulerTest,
 //
 
 // Test scheduler behavior for straight graph with known execution time of all nodes and permutes.
-// This test is parameterized with executor name and runs three times - one time for each executor.
-TEST_P(SchedulerTestWithStringParam, straight_graph_known_exec_time)
+TEST_P(SchedulerTestWithExecutorParam, straight_graph_known_exec_time)
 {
   setExecutor(GetParam());
 
   // Prepare graph
-  std::shared_ptr<graph::Graph> graph;
-  graph = createStraightGraph();
-  OperationIndex add_op_idx(0), mul_op_idx(1), fc_op_idx(2);
-  auto add_op_size = calcOpSize(*graph, add_op_idx);
-  auto mul_op_size = calcOpSize(*graph, mul_op_idx);
-  auto fc_op_size = calcOpSize(*graph, fc_op_idx);
+  auto graph(createStraightGraph());
+  OperationIndex add_op_idx(0), sub_op_idx(1), mul_op_idx(2);
+
+  // Set default execution and transfer time
+  setPermutationsExecutionTime(_mock_backends, OPERAND_SIZE, 1);
+  setOperationsExecutionTime(_mock_backends, {"Add", "Sub", "Mul"},
+                             {OPERATION_SIZE, OPERATION_SIZE, OPERATION_SIZE}, 1e4);
 
   // Test 1
-  // Expected behaviour: scheduler plan each node on different backend
+  // Expected behaviour: scheduler assigns a different backend to each node
   {
-    // Prepare execution time
-    setPermutationsExecutionTime(_mock_backends, OPERAND_SIZE, 1);
-    setOperationsExecutionTime(_mock_backends, {"Add", "Mul", "FullyConnected"},
-                               {add_op_size, mul_op_size, fc_op_size}, 10000);
+    // For each backend reduce execution time of one node
     ExecTime et(_mock_backends);
-    et.updateOperationExecTime(_cpu_backend, "Add", false, add_op_size, 1);
-    et.updateOperationExecTime(_gpu_backend, "Mul", false, mul_op_size, 1);
-    et.updateOperationExecTime(_npu_backend, "FullyConnected", false, fc_op_size, 1);
+    setOperationExecTime(et, _cpu_backend, "Add", false, OPERATION_SIZE, 1);
+    setOperationExecTime(et, _gpu_backend, "Sub", false, OPERATION_SIZE, 1);
+    setOperationExecTime(et, _npu_backend, "Mul", false, OPERATION_SIZE, 1);
     et.uploadOperationsExecTime();
 
     // Test scheduler
     auto scheduler = compiler::Scheduler(graph->operands(), _mock_backends);
-    const auto backend_resolver = scheduler.schedule(*graph);
-    ASSERT_EQ(backend_resolver->getBackend(add_op_idx)->config()->id(), "cpu");
-    ASSERT_EQ(backend_resolver->getBackend(mul_op_idx)->config()->id(), "gpu");
-    ASSERT_EQ(backend_resolver->getBackend(fc_op_idx)->config()->id(), "npu");
+    const auto br = scheduler.schedule(*graph);
+    ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "cpu");
+    ASSERT_EQ(br->getBackend(sub_op_idx)->config()->id(), "gpu");
+    ASSERT_EQ(br->getBackend(mul_op_idx)->config()->id(), "npu");
   }
 
   // Test 2
-  // Expected behaviour: scheduler plan all nodes on single backend because of big transfer time
+  // Expected behaviour: scheduler assigns a single backend to all nodes because of the big transfer time
   {
     // Increase transfer time
-    setPermutationsExecutionTime(_mock_backends, OPERAND_SIZE, 100000);
+    setPermutationsExecutionTime(_mock_backends, OPERAND_SIZE, 1e5);
+
+    // Test scheduler
+    auto scheduler = compiler::Scheduler(graph->operands(), _mock_backends);
+    const auto br = scheduler.schedule(*graph);
+    ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "cpu");
+    ASSERT_EQ(br->getBackend(sub_op_idx)->config()->id(), "cpu");
+    ASSERT_EQ(br->getBackend(mul_op_idx)->config()->id(), "cpu");
+  }
+}
+
+// Test scheduler behavior for branched graph with known execution time of all nodes and permutes
+TEST_P(SchedulerTestWithExecutorParam, branched_graph_known_exec_time)
+{
+  const int64_t NPU_ET = 5000;
+  setExecutor(GetParam());
+
+  // Prepare graph
+  auto graph(createBranchedGraph());
+  OperationIndex add_op_idx(0), mul1_op_idx(1), mul2_op_idx(2), fc1_op_idx(3), fc2_op_idx(4),
+      sub_op_idx(5);
+
+  // Set default execution and transfer time
+  setPermutationsExecutionTime(_mock_backends, OPERAND_SIZE, 1000);
+  setOperationsExecutionTime(_mock_backends, {"Add", "Sub", "Mul", "FullyConnected"},
+                             {OPERATION_SIZE, OPERATION_SIZE, OPERATION_SIZE, OPERATION_SIZE}, 1e4);
+
+  // Test 1
+  // Expected behaviour: for the dataflow and linear executors the scheduler assigns the fastest
+  // backend to all nodes; for the parallel executor it assigns different backends to the branches.
+  {
+    // Reduce execution time
+    ExecTime et(_mock_backends);
+    setOperationExecTime(et, _npu_backend, "Add", false, OPERATION_SIZE, NPU_ET);
+    setOperationExecTime(et, _npu_backend, "Mul", false, OPERATION_SIZE, NPU_ET);
+    setOperationExecTime(et, _npu_backend, "Sub", false, OPERATION_SIZE, NPU_ET);
+    setOperationExecTime(et, _npu_backend, "FullyConnected", false, OPERATION_SIZE, NPU_ET);
+    setOperationExecTime(et, _gpu_backend, "Mul", false, OPERATION_SIZE, NPU_ET + 1000);
+    setOperationExecTime(et, _gpu_backend, "FullyConnected", false, OPERATION_SIZE, NPU_ET + 1000);
+    et.uploadOperationsExecTime();
+
+    // Test scheduler
+    auto scheduler = compiler::Scheduler(graph->operands(), _mock_backends);
+    const auto br = scheduler.schedule(*graph);
+
+    std::string branch1_expected_backend("npu"), branch2_expected_backend("npu");
+    if (GetParam() == PARALLEL)
+    {
+      branch1_expected_backend =
+          br->getBackend(mul1_op_idx)->config()->id() == "npu" ? "npu" : "gpu";
+      branch2_expected_backend = branch1_expected_backend == "npu" ? "gpu" : "npu";
+    }
+
+    ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "npu");
+    ASSERT_EQ(br->getBackend(mul1_op_idx)->config()->id(), branch1_expected_backend);
+    ASSERT_EQ(br->getBackend(mul2_op_idx)->config()->id(), branch1_expected_backend);
+    ASSERT_EQ(br->getBackend(fc1_op_idx)->config()->id(), branch2_expected_backend);
+    ASSERT_EQ(br->getBackend(fc2_op_idx)->config()->id(), branch2_expected_backend);
+    ASSERT_EQ(br->getBackend(sub_op_idx)->config()->id(), "npu");
+  }
+
+  // Test 2
+  // Expected behaviour: scheduler assigns a single backend to all nodes
+  {
+    // Increase execution time for GPU backend
+    ExecTime et(_mock_backends);
+    /* For the parallel executor: set a time larger than branches_cnt * npu_exec_time so that
+       npu is preferred: the (i+1)-th level node of the first branch will wait for npu until it
+       finishes the i-th nodes of all other branches in BFS order */
+    setOperationExecTime(et, _gpu_backend, "Mul", false, OPERATION_SIZE, NPU_ET * 2 + 1);
+    /* For the parallel executor: set the FC execution time larger than Mul's to make the result
+       deterministic: if they were equal and scheduling were done in the order mul1->FC1->FC2->mul2,
+       then gpu would be selected for mul2 since NPU_ET*3 > GPU_ET (which is NPU_ET * 2 + 1) */
+    setOperationExecTime(et, _gpu_backend, "FullyConnected", false, OPERATION_SIZE, NPU_ET * 2 + 2);
+    et.uploadOperationsExecTime();
 
     // Test scheduler
     auto scheduler = compiler::Scheduler(graph->operands(), _mock_backends);
-    const auto backend_resolver = scheduler.schedule(*graph);
-    ASSERT_EQ(backend_resolver->getBackend(add_op_idx)->config()->id(), "cpu");
-    ASSERT_EQ(backend_resolver->getBackend(mul_op_idx)->config()->id(), "cpu");
-    ASSERT_EQ(backend_resolver->getBackend(fc_op_idx)->config()->id(), "cpu");
+    const auto br = scheduler.schedule(*graph);
+    ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "npu");
+    ASSERT_EQ(br->getBackend(mul1_op_idx)->config()->id(), "npu");
+    ASSERT_EQ(br->getBackend(mul2_op_idx)->config()->id(), "npu");
+    ASSERT_EQ(br->getBackend(fc1_op_idx)->config()->id(), "npu");
+    ASSERT_EQ(br->getBackend(fc2_op_idx)->config()->id(), "npu");
+    ASSERT_EQ(br->getBackend(sub_op_idx)->config()->id(), "npu");
   }
 }
 
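
The GPU times in Test 2 above are chosen against the bound described in its
comments. A worked check of those numbers (the two-branch cost model is my
reading of the comments, an assumption rather than the scheduler's actual
cost function):

    #include <cstdint>

    int main()
    {
      const int64_t NPU_ET = 5000;
      const int64_t branches_cnt = 2;              // Mul branch and FC branch
      const int64_t gpu_mul = NPU_ET * 2 + 1;      // just above branches_cnt * NPU_ET
      const int64_t gpu_fc = NPU_ET * 2 + 2;       // strictly above gpu_mul for determinism
      // npu should stay preferred even when a branch node has to wait for the
      // other branch to finish on npu, and FC != Mul keeps the tie-break stable.
      const bool npu_preferred = gpu_mul > branches_cnt * NPU_ET && gpu_fc > gpu_mul;
      return npu_preferred ? 0 : 1;
    }
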
"gpu" : "npu"; + } + + ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "npu"); + ASSERT_EQ(br->getBackend(mul1_op_idx)->config()->id(), branch1_expected_backend); + ASSERT_EQ(br->getBackend(mul2_op_idx)->config()->id(), branch1_expected_backend); + ASSERT_EQ(br->getBackend(fc1_op_idx)->config()->id(), branch2_expected_backend); + ASSERT_EQ(br->getBackend(fc2_op_idx)->config()->id(), branch2_expected_backend); + ASSERT_EQ(br->getBackend(sub_op_idx)->config()->id(), "npu"); + } + + // Test 2 + // Expected behaviour: scheduler assigns single backend to all nodes + { + // Increase execution time for GPU backend + ExecTime et(_mock_backends); + /* for parallel executor: set a time, that is larger than branches_cnt*npu_exec_time + so that npu is prefered: the i+1 level node of the first branch will wait for npu + until it finishes the i-th nodes of all other branches in BFS order*/ + setOperationExecTime(et, _gpu_backend, "Mul", false, OPERATION_SIZE, NPU_ET * 2 + 1); + /* for parallel executor: set ET of FC larger than Mul's to be determinant: + if they are equal and scheduling is done in order mul1->FC1->FC2->mul2, + then for mul2 gpu is selected since NPU_ET*3 > GPU_ET(which is NPU_ET * 2 + 1)*/ + setOperationExecTime(et, _gpu_backend, "FullyConnected", false, OPERATION_SIZE, NPU_ET * 2 + 2); + et.uploadOperationsExecTime(); // Test scheduler auto scheduler = compiler::Scheduler(graph->operands(), _mock_backends); - const auto backend_resolver = scheduler.schedule(*graph); - ASSERT_EQ(backend_resolver->getBackend(add_op_idx)->config()->id(), "cpu"); - ASSERT_EQ(backend_resolver->getBackend(mul_op_idx)->config()->id(), "cpu"); - ASSERT_EQ(backend_resolver->getBackend(fc_op_idx)->config()->id(), "cpu"); + const auto br = scheduler.schedule(*graph); + ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "npu"); + ASSERT_EQ(br->getBackend(mul1_op_idx)->config()->id(), "npu"); + ASSERT_EQ(br->getBackend(mul2_op_idx)->config()->id(), "npu"); + ASSERT_EQ(br->getBackend(fc1_op_idx)->config()->id(), "npu"); + ASSERT_EQ(br->getBackend(fc2_op_idx)->config()->id(), "npu"); + ASSERT_EQ(br->getBackend(sub_op_idx)->config()->id(), "npu"); } } -INSTANTIATE_TEST_CASE_P(AllExecutors, SchedulerTestWithStringParam, + +// SchedulerTestWithExecutorParam tests are parameterized with executor name and runs three times - +// one time for each executor +INSTANTIATE_TEST_CASE_P(AllExecutors, SchedulerTestWithExecutorParam, testing::Values(LINEAR, DATAFLOW, PARALLEL)); +// TODO: Add tests with unknown execution and permutation time +// TODO: Add tests for scheduler profiling mode + } // unnamed namespace -- 2.7.4