From 8e047d5fb38afd05770f97b6155c881ac44d74fa Mon Sep 17 00:00:00 2001
From: =?utf8?q?Ivan=20Vagin/AI=20Tools=20Lab=20/SRR/Engineer/=EC=82=BC?=
 =?utf8?q?=EC=84=B1=EC=A0=84=EC=9E=90?=
Date: Fri, 30 Aug 2019 11:17:48 +0900
Subject: [PATCH] [neurun] Implemented scheduler test with branched graph
 (#6926)

* [neurun] Implemented scheduler test with branched graph

* Implemented scheduler test with branched graph and known execution time

* Added set time methods to ExecTime

* Refactored existing scheduler test a little bit

Signed-off-by: Ivan Vagin

* 'set' methods from ExecutionTime replaced with helper functions

Signed-off-by: Ivan Vagin

* remove profile data before test

Signed-off-by: Ivan Vagin

* Review fixes

Signed-off-by: Ivan Vagin

* Increased execution time for GPU in last test

Signed-off-by: Ivan Vagin

* Set correct large exec time for gpu

Signed-off-by: Poshshoev Dilshodzhon
---
 runtimes/neurun/test/core/compiler/Scheduler.cc | 306 ++++++++++++++++++------
 1 file changed, 236 insertions(+), 70 deletions(-)

diff --git a/runtimes/neurun/test/core/compiler/Scheduler.cc b/runtimes/neurun/test/core/compiler/Scheduler.cc
index 5e5cb6c..4a56b08 100644
--- a/runtimes/neurun/test/core/compiler/Scheduler.cc
+++ b/runtimes/neurun/test/core/compiler/Scheduler.cc
@@ -24,6 +24,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 
 
@@ -34,6 +35,7 @@ namespace
 using namespace neurun;
 using namespace model;
 using namespace backend;
+using namespace operation;
 
 //
 // Mock backends classes
@@ -96,6 +98,7 @@ struct MockBackendNPU : public Backend
 
 const int OPERAND_ELEMS = 268203;
 const int OPERAND_SIZE = OPERAND_ELEMS * 4;
+const int OPERATION_SIZE = OPERAND_SIZE * 3;
 
 const std::string LINEAR("Linear");
 const std::string DATAFLOW("Dataflow");
@@ -109,51 +112,27 @@ const std::string PARALLEL("Parallel");
 
 void setExecutor(const std::string &executor) { setenv("EXECUTOR", executor.c_str(), true); }
 
 // Calculate operation size by addition sizes of all input and output operands
-uint32_t calcOpSize(const graph::Graph &graph, const OperationIndex &op_idx)
+uint32_t calcOpSize(const std::shared_ptr<graph::Graph> &graph, const OperationIndex &op_idx)
 {
   uint32_t size = 0;
-  for (const auto &input : graph.operations().at(op_idx).getInputs())
-    size += graph.operands().at(input).info().total_size();
-  for (const auto &output : graph.operations().at(op_idx).getOutputs())
-    size += graph.operands().at(output).info().total_size();
+  for (const auto &input : graph->operations().at(op_idx).getInputs())
+    size += graph->operands().at(input).info().total_size();
+  for (const auto &output : graph->operations().at(op_idx).getOutputs())
+    size += graph->operands().at(output).info().total_size();
   return size;
 }
 
-// Create straight graph: Add->Mul->FullyConnected
-std::unique_ptr<graph::Graph> createStraightGraph()
+// Set execution operation time. This method is needed since ExecTime has only
+// the 'updateOperationExecTime' method.
+void setOperationExecTime(ExecTime &et, const Backend *backend, const std::string &operation,
+                          bool quant, uint32_t op_size, int64_t time)
 {
-  auto graph = nnfw::cpp14::make_unique<graph::Graph>(nnfw::cpp14::make_unique<Model>());
-
-  // Create operands
-  const TypeInfo float_op(DataType::FLOAT32);
-  auto add_lhs_index = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
-  auto add_rhs_index = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
-  auto add_out_index = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
-  auto mul_lhs_index = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
-  auto mul_out_index = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
-  auto fc_weights_index = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
-  auto fc_out_index = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
-
-  // Create AddNode
-  operation::AddNode::Param add_param{Activation::NONE};
-  auto add_op = std::unique_ptr<operation::AddNode>(
-      new operation::AddNode({add_lhs_index, add_rhs_index}, {add_out_index}, add_param));
-  graph->addOperation(std::move(add_op));
-
-  // Create MulNode
-  operation::MulNode::Param mul_param{Activation::NONE};
-  auto mul_op = std::unique_ptr<operation::MulNode>(
-      new operation::MulNode({mul_lhs_index, add_out_index}, {mul_out_index}, mul_param));
-  graph->addOperation(std::move(mul_op));
-
-  // Create FullyConnectedNode
-  operation::FullyConnectedNode::Param fc_param{Activation::NONE};
-  auto fc_op = std::unique_ptr<operation::FullyConnectedNode>(new operation::FullyConnectedNode(
-      {fc_weights_index, mul_out_index}, {fc_out_index}, fc_param));
-  graph->addOperation(std::move(fc_op));
-
-  graph->finishBuilding();
-  return graph;
+  // You shouldn't set a negative time with this method since the nnfw JSON deserializer can't read it
+  assert(time > 0);
+  int64_t prev_time = et.getOperationExecTime(backend, operation, quant, op_size);
+  int64_t time_to_set = prev_time == ExecTime::NOT_FOUND ? time : 2 * time - prev_time;
+  et.updateOperationExecTime(backend, operation, quant, op_size, time_to_set);
+  assert(et.getOperationExecTime(backend, operation, quant, op_size) == time);
 }
 
 // Set same execution time for all given backends
@@ -166,11 +145,24 @@ void setOperationsExecutionTime(const std::vector<const Backend *> &backends,
   for (int i = 0; i < op_names.size(); ++i)
   {
     for (auto &backend : backends)
-      et.updateOperationExecTime(backend, op_names[i], false, op_sizes[i], exec_time);
+      setOperationExecTime(et, backend, op_names[i], false, op_sizes[i], exec_time);
   }
   et.uploadOperationsExecTime();
 }
 
+// Set permute time from one backend to another. This method is needed since ExecTime has only
+// the 'updatePermuteTime' method.
+void setPermutationTime(ExecTime &et, const Backend *from_backend, const Backend *to_backend,
+                        bool quant, uint32_t op_size, int64_t time)
+{
+  // You shouldn't set a negative time with this method since the nnfw JSON deserializer can't read it
+  assert(time > 0);
+  int64_t prev_time = et.getPermuteTime(from_backend, to_backend, quant, op_size);
+  int64_t time_to_set = prev_time == ExecTime::NOT_FOUND ? time : 2 * time - prev_time;
+  et.updatePermuteTime(from_backend, to_backend, quant, op_size, time_to_set);
+  assert(et.getPermuteTime(from_backend, to_backend, quant, op_size) == time);
+}
+
 // Set same permutation time between all given backends
 void setPermutationsExecutionTime(const std::vector<const Backend *> &backends,
                                   const int operand_size, const int64_t exec_time)
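
Why "2 * time - prev_time" forces an exact value: both helpers above assume
that ExecTime's update methods fold a new sample into a running average of the
previous and new values. A minimal sketch of that arithmetic, with the
averaging behaviour as an assumption and 'averaged_update' as a hypothetical
stand-in for the real API:

    #include <cassert>
    #include <cstdint>

    // Assumed behaviour of ExecTime::update*Time when a previous sample exists.
    int64_t averaged_update(int64_t prev_time, int64_t new_time)
    {
      return (prev_time + new_time) / 2;
    }

    int main()
    {
      const int64_t prev = 3000, target = 5000;
      // Feeding 2 * target - prev makes the stored average land exactly on target.
      assert(averaged_update(prev, 2 * target - prev) == target);
      return 0;
    }
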
@@ -182,13 +174,104 @@
     {
       if (backend == other_backend)
         continue;
-      et.updatePermuteTime(backend, other_backend, false, operand_size, exec_time);
+      setPermutationTime(et, backend, other_backend, false, operand_size, exec_time);
     }
   }
   et.uploadOperationsExecTime();
 }
 
 //
+// Functions for creating graphs
+//
+
+using OIS = OperandIndexSequence;
+
+template <typename NodeT, typename... Types>
+model::OperationIndex createNode(std::shared_ptr<graph::Graph> graph, Types &&... args)
+{
+  typename NodeT::Param op_params{Activation::NONE};
+  auto op = nnfw::cpp14::make_unique<NodeT>(std::forward<Types>(args)..., op_params);
+  auto op_idx = graph->addOperation(std::move(op));
+  // For now, all operations in the tested graphs have the same size (for simplicity)
+  assert(calcOpSize(graph, op_idx) == OPERATION_SIZE);
+  return op_idx;
+}
+
+// Create straight graph: Add->Sub->Mul
+std::shared_ptr<graph::Graph> createStraightGraph()
+{
+  auto graph = std::make_shared<graph::Graph>(nnfw::cpp14::make_unique<Model>());
+  const TypeInfo float_op(DataType::FLOAT32);
+
+  // Create add node
+  auto add_lhs_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto add_rhs_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto add_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<AddNode>(graph, OIS{add_lhs_idx, add_rhs_idx}, OIS{add_out_idx});
+
+  // Create sub node
+  auto sub_const_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto sub_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<SubNode>(graph, OIS{add_out_idx, sub_const_idx}, OIS{sub_out_idx});
+
+  // Create mul node
+  auto mul_const_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto mul_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<MulNode>(graph, OIS{sub_out_idx, mul_const_idx}, OIS{mul_out_idx});
+
+  graph->finishBuilding();
+  return graph;
+}
+
+/* Create branched graph:
+ *       [Add]
+ *      //    \\
+ *   [Mul1]    [FC1]
+ *     ||       ||
+ *   [Mul2]    [FC2]
+ *      \\    //
+ *       [Sub]
+ */
+std::shared_ptr<graph::Graph> createBranchedGraph()
+{
+  auto graph = std::make_shared<graph::Graph>(nnfw::cpp14::make_unique<Model>());
+  const TypeInfo float_op(DataType::FLOAT32);
+
+  // Create add node
+  auto add_lhs_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto add_rhs_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto add_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<AddNode>(graph, OIS{add_lhs_idx, add_rhs_idx}, OIS{add_out_idx});
+
+  // Create mul1 node
+  auto mul1_const_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto mul1_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<MulNode>(graph, OIS{add_out_idx, mul1_const_idx}, OIS{mul1_out_idx});
+
+  // Create mul2 node
+  auto mul2_const_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto mul2_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<MulNode>(graph, OIS{mul1_out_idx, mul2_const_idx}, OIS{mul2_out_idx});
+
+  // Create fc1 node
+  auto fc1_const_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto fc1_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<FullyConnectedNode>(graph, OIS{add_out_idx, fc1_const_idx}, OIS{fc1_out_idx});
+
+  // Create fc2 node
+  auto fc2_const_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  auto fc2_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<FullyConnectedNode>(graph, OIS{fc1_out_idx, fc2_const_idx}, OIS{fc2_out_idx});
+
+  // Create sub node
+  auto sub_out_idx = graph->addOperand(Shape{OPERAND_ELEMS}, float_op);
+  createNode<SubNode>(graph, OIS{mul2_out_idx, fc2_out_idx}, OIS{sub_out_idx});
+
+  graph->finishBuilding();
+  return graph;
+}
+
+//
 // Tests setup/teardown
 //
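
How OPERATION_SIZE relates to the graphs above: every node created through
createNode has exactly two input operands and one output operand, each a
float32 tensor of OPERAND_ELEMS elements, so calcOpSize always sums to three
operand sizes, which is what the assert in createNode checks. A standalone
sketch of the same arithmetic:

    #include <cassert>
    #include <cstdint>

    int main()
    {
      const uint32_t OPERAND_ELEMS = 268203;
      const uint32_t OPERAND_SIZE = OPERAND_ELEMS * 4; // float32 -> 4 bytes per element
      // Two inputs plus one output, all of the same operand size:
      const uint32_t op_size = 2 * OPERAND_SIZE + 1 * OPERAND_SIZE;
      assert(op_size == OPERAND_SIZE * 3); // == OPERATION_SIZE in the test
      return 0;
    }
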
@@ -204,6 +287,9 @@ protected:
     _npu_backend = new MockBackendNPU();
     _mock_backends = {_cpu_backend, _gpu_backend, _npu_backend};
 
+    // Remove previous profile data if it exists
+    remove("exec_time.json");
+
     // Remember original value of 'EXECUTOR' environment variable
     char *executor = std::getenv("EXECUTOR");
     _original_executor = executor == nullptr ? "" : executor;
@@ -226,8 +312,8 @@ protected:
   std::string _original_executor;
 };
 
-class SchedulerTestWithStringParam : public SchedulerTest,
-                                     public testing::WithParamInterface<std::string>
+class SchedulerTestWithExecutorParam : public SchedulerTest,
+                                       public testing::WithParamInterface<std::string>
 {
 };
 
@@ -236,55 +322,135 @@ class SchedulerTestWithStringParam : public SchedulerTest,
 //
 
 // Test scheduler behavior for straight graph with known execution time of all nodes and permutes.
-// This test is parameterized with executor name and runs three times - one time for each executor.
-TEST_P(SchedulerTestWithStringParam, straight_graph_known_exec_time)
+TEST_P(SchedulerTestWithExecutorParam, straight_graph_known_exec_time)
 {
   setExecutor(GetParam());
 
   // Prepare graph
-  std::shared_ptr<graph::Graph> graph;
-  graph = createStraightGraph();
-  OperationIndex add_op_idx(0), mul_op_idx(1), fc_op_idx(2);
-  auto add_op_size = calcOpSize(*graph, add_op_idx);
-  auto mul_op_size = calcOpSize(*graph, mul_op_idx);
-  auto fc_op_size = calcOpSize(*graph, fc_op_idx);
+  auto graph(createStraightGraph());
+  OperationIndex add_op_idx(0), sub_op_idx(1), mul_op_idx(2);
+
+  // Set default execution and transfer time
+  setPermutationsExecutionTime(_mock_backends, OPERAND_SIZE, 1);
+  setOperationsExecutionTime(_mock_backends, {"Add", "Sub", "Mul"},
+                             {OPERATION_SIZE, OPERATION_SIZE, OPERATION_SIZE}, 1e4);
 
   // Test 1
-  // Expected behaviour: scheduler plan each node on different backend
+  // Expected behaviour: scheduler assigns a different backend to each node
   {
-    // Prepare execution time
-    setPermutationsExecutionTime(_mock_backends, OPERAND_SIZE, 1);
-    setOperationsExecutionTime(_mock_backends, {"Add", "Mul", "FullyConnected"},
-                               {add_op_size, mul_op_size, fc_op_size}, 10000);
+    // For each backend reduce execution time of one node
     ExecTime et(_mock_backends);
-    et.updateOperationExecTime(_cpu_backend, "Add", false, add_op_size, 1);
-    et.updateOperationExecTime(_gpu_backend, "Mul", false, mul_op_size, 1);
-    et.updateOperationExecTime(_npu_backend, "FullyConnected", false, fc_op_size, 1);
+    setOperationExecTime(et, _cpu_backend, "Add", false, OPERATION_SIZE, 1);
+    setOperationExecTime(et, _gpu_backend, "Sub", false, OPERATION_SIZE, 1);
+    setOperationExecTime(et, _npu_backend, "Mul", false, OPERATION_SIZE, 1);
     et.uploadOperationsExecTime();
 
     // Test scheduler
     auto scheduler = compiler::Scheduler(graph->operands(), _mock_backends);
-    const auto backend_resolver = scheduler.schedule(*graph);
-    ASSERT_EQ(backend_resolver->getBackend(add_op_idx)->config()->id(), "cpu");
-    ASSERT_EQ(backend_resolver->getBackend(mul_op_idx)->config()->id(), "gpu");
-    ASSERT_EQ(backend_resolver->getBackend(fc_op_idx)->config()->id(), "npu");
+    const auto br = scheduler.schedule(*graph);
+    ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "cpu");
+    ASSERT_EQ(br->getBackend(sub_op_idx)->config()->id(), "gpu");
+    ASSERT_EQ(br->getBackend(mul_op_idx)->config()->id(), "npu");
   }
 
   // Test 2
-  // Expected behaviour: scheduler plan all nodes on single backend because of big transfer time
+  // Expected behaviour: scheduler assigns a single backend to all nodes because of the big transfer time
   {
     // Increase transfer time
-    setPermutationsExecutionTime(_mock_backends, OPERAND_SIZE, 100000);
+    setPermutationsExecutionTime(_mock_backends, OPERAND_SIZE, 1e5);
+
+    // Test scheduler
+    auto scheduler = compiler::Scheduler(graph->operands(), _mock_backends);
+    const auto br = scheduler.schedule(*graph);
+    ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "cpu");
+    ASSERT_EQ(br->getBackend(sub_op_idx)->config()->id(), "cpu");
+    ASSERT_EQ(br->getBackend(mul_op_idx)->config()->id(), "cpu");
+  }
+}
+
+// Test scheduler behavior for branched graph with known execution time of all nodes and permutes
+TEST_P(SchedulerTestWithExecutorParam, branched_graph_known_exec_time)
+{
+  const int64_t NPU_ET = 5000;
+  setExecutor(GetParam());
+
+  // Prepare graph
+  auto graph(createBranchedGraph());
+  OperationIndex add_op_idx(0), mul1_op_idx(1), mul2_op_idx(2), fc1_op_idx(3), fc2_op_idx(4),
+      sub_op_idx(5);
+
+  // Set default execution and transfer time
+  setPermutationsExecutionTime(_mock_backends, OPERAND_SIZE, 1000);
+  setOperationsExecutionTime(_mock_backends, {"Add", "Sub", "Mul", "FullyConnected"},
+                             {OPERATION_SIZE, OPERATION_SIZE, OPERATION_SIZE, OPERATION_SIZE}, 1e4);
+
+  // Test 1
+  // Expected behaviour: for the dataflow and linear executors the scheduler assigns the fastest
+  // backend to all nodes; for the parallel executor it assigns different backends to the branches.
+  {
+    // Reduce execution time
+    ExecTime et(_mock_backends);
+    setOperationExecTime(et, _npu_backend, "Add", false, OPERATION_SIZE, NPU_ET);
+    setOperationExecTime(et, _npu_backend, "Mul", false, OPERATION_SIZE, NPU_ET);
+    setOperationExecTime(et, _npu_backend, "Sub", false, OPERATION_SIZE, NPU_ET);
+    setOperationExecTime(et, _npu_backend, "FullyConnected", false, OPERATION_SIZE, NPU_ET);
+    setOperationExecTime(et, _gpu_backend, "Mul", false, OPERATION_SIZE, NPU_ET + 1000);
+    setOperationExecTime(et, _gpu_backend, "FullyConnected", false, OPERATION_SIZE, NPU_ET + 1000);
+    et.uploadOperationsExecTime();
+
+    // Test scheduler
+    auto scheduler = compiler::Scheduler(graph->operands(), _mock_backends);
+    const auto br = scheduler.schedule(*graph);
+
+    std::string branch1_expected_backend("npu"), branch2_expected_backend("npu");
+    if (GetParam() == PARALLEL)
+    {
+      branch1_expected_backend =
+          br->getBackend(mul1_op_idx)->config()->id() == "npu" ? "npu" : "gpu";
+      branch2_expected_backend = branch1_expected_backend == "npu" ? "gpu" : "npu";
+    }
+
+    ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "npu");
+    ASSERT_EQ(br->getBackend(mul1_op_idx)->config()->id(), branch1_expected_backend);
+    ASSERT_EQ(br->getBackend(mul2_op_idx)->config()->id(), branch1_expected_backend);
+    ASSERT_EQ(br->getBackend(fc1_op_idx)->config()->id(), branch2_expected_backend);
+    ASSERT_EQ(br->getBackend(fc2_op_idx)->config()->id(), branch2_expected_backend);
+    ASSERT_EQ(br->getBackend(sub_op_idx)->config()->id(), "npu");
+  }
+
+  // Test 2
+  // Expected behaviour: scheduler assigns a single backend to all nodes
+  {
+    // Increase execution time for GPU backend
+    ExecTime et(_mock_backends);
+    /* For the parallel executor: set a time larger than branches_cnt * npu_exec_time so that
+       npu is preferred: the (i+1)-th level node of the first branch will wait for npu until it
+       finishes the i-th nodes of all other branches in BFS order */
+    setOperationExecTime(et, _gpu_backend, "Mul", false, OPERATION_SIZE, NPU_ET * 2 + 1);
+    /* For the parallel executor: set the FC execution time larger than Mul's to make the result
+       deterministic: if they were equal and scheduling were done in the order mul1->FC1->FC2->mul2,
+       then gpu would be selected for mul2 since NPU_ET*3 > GPU_ET (which is NPU_ET * 2 + 1) */
+    setOperationExecTime(et, _gpu_backend, "FullyConnected", false, OPERATION_SIZE, NPU_ET * 2 + 2);
+    et.uploadOperationsExecTime();
 
     // Test scheduler
     auto scheduler = compiler::Scheduler(graph->operands(), _mock_backends);
-    const auto backend_resolver = scheduler.schedule(*graph);
-    ASSERT_EQ(backend_resolver->getBackend(add_op_idx)->config()->id(), "cpu");
-    ASSERT_EQ(backend_resolver->getBackend(mul_op_idx)->config()->id(), "cpu");
-    ASSERT_EQ(backend_resolver->getBackend(fc_op_idx)->config()->id(), "cpu");
+    const auto br = scheduler.schedule(*graph);
+    ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "npu");
+    ASSERT_EQ(br->getBackend(mul1_op_idx)->config()->id(), "npu");
+    ASSERT_EQ(br->getBackend(mul2_op_idx)->config()->id(), "npu");
+    ASSERT_EQ(br->getBackend(fc1_op_idx)->config()->id(), "npu");
+    ASSERT_EQ(br->getBackend(fc2_op_idx)->config()->id(), "npu");
+    ASSERT_EQ(br->getBackend(sub_op_idx)->config()->id(), "npu");
   }
 }
 
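
The GPU times in Test 2 above are chosen against the bound described in its
comments. A worked check of those numbers (the two-branch cost model is my
reading of the comments, an assumption rather than the scheduler's actual
cost function):

    #include <cstdint>

    int main()
    {
      const int64_t NPU_ET = 5000;
      const int64_t branches_cnt = 2;              // Mul branch and FC branch
      const int64_t gpu_mul = NPU_ET * 2 + 1;      // just above branches_cnt * NPU_ET
      const int64_t gpu_fc = NPU_ET * 2 + 2;       // strictly above gpu_mul for determinism
      // npu should stay preferred even when a branch node has to wait for the
      // other branch to finish on npu, and FC != Mul keeps the tie-break stable.
      const bool npu_preferred = gpu_mul > branches_cnt * NPU_ET && gpu_fc > gpu_mul;
      return npu_preferred ? 0 : 1;
    }
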
"gpu" : "npu"; + } + + ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "npu"); + ASSERT_EQ(br->getBackend(mul1_op_idx)->config()->id(), branch1_expected_backend); + ASSERT_EQ(br->getBackend(mul2_op_idx)->config()->id(), branch1_expected_backend); + ASSERT_EQ(br->getBackend(fc1_op_idx)->config()->id(), branch2_expected_backend); + ASSERT_EQ(br->getBackend(fc2_op_idx)->config()->id(), branch2_expected_backend); + ASSERT_EQ(br->getBackend(sub_op_idx)->config()->id(), "npu"); + } + + // Test 2 + // Expected behaviour: scheduler assigns single backend to all nodes + { + // Increase execution time for GPU backend + ExecTime et(_mock_backends); + /* for parallel executor: set a time, that is larger than branches_cnt*npu_exec_time + so that npu is prefered: the i+1 level node of the first branch will wait for npu + until it finishes the i-th nodes of all other branches in BFS order*/ + setOperationExecTime(et, _gpu_backend, "Mul", false, OPERATION_SIZE, NPU_ET * 2 + 1); + /* for parallel executor: set ET of FC larger than Mul's to be determinant: + if they are equal and scheduling is done in order mul1->FC1->FC2->mul2, + then for mul2 gpu is selected since NPU_ET*3 > GPU_ET(which is NPU_ET * 2 + 1)*/ + setOperationExecTime(et, _gpu_backend, "FullyConnected", false, OPERATION_SIZE, NPU_ET * 2 + 2); + et.uploadOperationsExecTime(); // Test scheduler auto scheduler = compiler::Scheduler(graph->operands(), _mock_backends); - const auto backend_resolver = scheduler.schedule(*graph); - ASSERT_EQ(backend_resolver->getBackend(add_op_idx)->config()->id(), "cpu"); - ASSERT_EQ(backend_resolver->getBackend(mul_op_idx)->config()->id(), "cpu"); - ASSERT_EQ(backend_resolver->getBackend(fc_op_idx)->config()->id(), "cpu"); + const auto br = scheduler.schedule(*graph); + ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "npu"); + ASSERT_EQ(br->getBackend(mul1_op_idx)->config()->id(), "npu"); + ASSERT_EQ(br->getBackend(mul2_op_idx)->config()->id(), "npu"); + ASSERT_EQ(br->getBackend(fc1_op_idx)->config()->id(), "npu"); + ASSERT_EQ(br->getBackend(fc2_op_idx)->config()->id(), "npu"); + ASSERT_EQ(br->getBackend(sub_op_idx)->config()->id(), "npu"); } } -INSTANTIATE_TEST_CASE_P(AllExecutors, SchedulerTestWithStringParam, + +// SchedulerTestWithExecutorParam tests are parameterized with executor name and runs three times - +// one time for each executor +INSTANTIATE_TEST_CASE_P(AllExecutors, SchedulerTestWithExecutorParam, testing::Values(LINEAR, DATAFLOW, PARALLEL)); +// TODO: Add tests with unknown execution and permutation time +// TODO: Add tests for scheduler profiling mode + } // unnamed namespace -- 2.7.4