2 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include "model/Operand.h"
18 #include "compiler/HEScheduler.h"
20 #include "util/ConfigSource.h"
21 #include "compiler/IExecutionBuilder.h"
22 #include "compiler/BackendResolver.h"
23 #include "backend/IShapeFixer.h"
24 #include "util/logging.h"
25 #include "util/Utils.h"
26 #include "exec/FunctionSequence.h"
36 static uint32_t getOperationsFlattenedIOSize(const graph::Graph &graph,
37 const model::Operation &node)
40 for (const auto &input : node.getInputs())
42 size += graph.operands().at(input).info().total_size();
44 for (const auto &output : node.getOutputs())
46 size += graph.operands().at(output).info().total_size();
51 static bool isQuant(const graph::Graph &graph, const model::Operation &node)
53 for (const auto &input : node.getInputs())
55 const auto &obj = graph.operands().at(input);
56 if (obj.typeInfo().type() == ir::DataType::QUANT8_ASYMM)
64 static bool isWorkaroundSkip(const graph::Graph &graph, const backend::Backend *backend,
65 const model::Operation &node, bool quant)
67 /* TODO: this is workaround, come up with better solution if have.
68 Adding exception in stage doesn't help. Because if there is a record for add without
69 broadcast, scheduling will select it since it doesn't distinguish broadcast and
70 non-broadcast like it does for quant non-quantized*/
71 if (backend->config()->id() == "cpu" &&
72 (node.opcode() == model::OpCode::Add || node.opcode() == model::OpCode::Sub ||
73 node.opcode() == model::OpCode::Mul))
75 const auto lhs_index{node.getInputs().at(model::operation::Add::Input::LHS)};
76 const auto rhs_index{node.getInputs().at(model::operation::Add::Input::RHS)};
77 /*Broadcasting isn't supported on CPU: no way to differ the existing exec_time record with and
78 * without broadcasting*/
79 if (!(graph.operands().at(lhs_index).shape() == graph.operands().at(rhs_index).shape()))
84 /* TODO: this is workaround, come up with better solution if have.
85 Adding exception in stage doesn't help. Because if there is a record for Mul without
86 broadcast, scheduling will select it since it doesn't distinguish broadcast and
87 non-broadcast like it does for quant non-quantized*/
88 else if (backend->config()->id() == "acl_neon" && node.opcode() == model::OpCode::Mul)
90 const auto lhs_index{node.getInputs().at(model::operation::Mul::Input::LHS)};
91 const auto rhs_index{node.getInputs().at(model::operation::Mul::Input::RHS)};
93 // Nontrivial broadcasting isn't supported yet
95 !(graph.operands().at(lhs_index).shape() == graph.operands().at(rhs_index).shape()))
103 // if a node can be merged into subgraph
104 static bool isMergable(const graph::Graph &graph, const model::Operation &node)
106 size_t prev_op_cnt = 0;
107 for (const auto &input : node.getInputs())
110 const auto &operand = graph.operands().at(input);
111 if (operand.isConstant())
114 // This operand is output of operation, not weight or bias
115 if (operand.getDef().list().size() > 0)
118 // Current node has multiple inputs as concat or at the beginning of the separated branch
119 if (prev_op_cnt > 1 || operand.getUses().list().size() > 1)
127 void HEScheduler::scheduleShufflingBackends()
129 VERBOSE(HEScheduler::schedule)
130 << "Started task scheduling: uses all backends to get more metrics for data transfer"
132 size_t backend_ind = 0;
133 for (const auto &rank : _rank_to_op)
135 VERBOSE(HEScheduler::schedule) << "scheduling (" << rank.second.value() << ")" << std::endl;
136 const auto &node = _graph->operations().at(rank.second);
137 const bool quant = isQuant(*_graph, node);
138 const auto size = getOperationsFlattenedIOSize(*_graph, node);
139 for (size_t i = 0;; ++i)
141 if (i == _all_backends.size())
143 // wasn't able to find backend
147 if (backend_ind == _all_backends.size())
151 if (isWorkaroundSkip(*_graph, _all_backends[backend_ind], node, quant))
156 const auto exec_time =
157 _exec_time->getOperationExecTime(_all_backends[backend_ind], node.name(), quant, size);
158 // Scheduling to measure data transfer must be done after measuring all backends separately
159 assert(exec_time != _exec_time->NOT_FOUND);
160 if (exec_time == _exec_time->getMax())
165 _backend_resolver->setBackend(rank.second, _all_backends[backend_ind]);
166 VERBOSE(HEScheduler::schedule) << "backend for " << node.name() << " is "
167 << _all_backends[backend_ind]->config()->id() << std::endl;
174 bool HEScheduler::isNodeProfiled(const model::Operation &node)
176 const bool quant = isQuant(*_graph, node);
177 const auto size = getOperationsFlattenedIOSize(*_graph, node);
178 for (const auto *backend : _all_backends)
180 const auto exec_time = _exec_time->getOperationExecTime(backend, node.name(), quant, size);
181 if (exec_time == _exec_time->NOT_FOUND)
187 void HEScheduler::scheduleBranch(const model::OperationIndex &index,
188 model::OperationIndexMap<bool> &scheduled)
190 auto loc_index = index;
191 const backend::Backend *parent_backend = nullptr;
194 if (scheduled[loc_index])
198 if (!schedule(loc_index, parent_backend))
202 scheduled[loc_index] = true;
203 parent_backend = _backend_resolver->getBackend(loc_index);
205 const auto &node = _graph->operations().at(loc_index);
206 model::OperandIndex tmp;
207 /* get the only output operand, that is input of the next single operation
208 * and just this nodes output.*/
209 if (node.getOutputs().size() != 1)
213 const auto &only_out_operand = _graph->operands().at(*node.getOutputs().begin());
214 loc_index = only_out_operand.getUses().list().front();
215 /* verify, that next node is neither beginning nor ending node of a branch*/
216 const auto &next_node = _graph->operations().at(loc_index);
217 if (!isMergable(*_graph, next_node))
224 std::unique_ptr<compiler::BackendResolver> HEScheduler::schedule(const graph::Graph &graph)
227 VERBOSE(HEScheduler::schedule) << "task scheduling started" << std::endl;
228 // Make ranks and save in descending order
231 for (const auto *backend : _all_backends)
233 _backends_avail_time.emplace(backend, std::map<int64_t, int64_t>{{0, 0}});
236 const bool is_profiling = util::getConfigBool(util::config::PROFILING_MODE);
239 // Check if profiling info about all backend/node pairs already exists
240 bool all_nodes_are_profiled = true;
241 _graph->operations().iterate([&](const model::OperationIndex &, const model::Operation &op) {
242 if (all_nodes_are_profiled)
243 all_nodes_are_profiled = isNodeProfiled(op);
246 // If all nodes are already profiled - schedule backends in such order, so more profiling
247 // information about between-backends data transfer could be collected
248 if (all_nodes_are_profiled)
250 scheduleShufflingBackends();
251 VERBOSE(HEScheduler::schedule) << "task scheduling finished" << std::endl;
252 return std::move(_backend_resolver);
256 model::OperationIndexMap<bool> visited;
257 graph.operations().iterate([&](const model::OperationIndex &index, const model::Operation &) {
258 visited[index] = false;
260 // for each task select the backend with the smallest earliest finishing time(eft)
261 for (const auto &rank : _rank_to_op)
263 scheduleBranch(rank.second, visited);
265 VERBOSE(HEScheduler::schedule) << "task scheduling finished" << std::endl;
266 return std::move(_backend_resolver);
269 int64_t HEScheduler::getOpTime(const backend::Backend *backend, const std::string &operation,
270 bool quant, uint32_t size)
272 const auto time = _exec_time->getOperationExecTime(backend, operation, quant, size);
273 if (time != _exec_time->NOT_FOUND)
276 return _is_supported.at(backend).at(operation) ? 1 : _exec_time->getMax();
279 int64_t HEScheduler::getPermuteTime(const backend::Backend *src_backend,
280 const backend::Backend *dst_backend, bool quant, uint32_t size)
282 const auto time = _exec_time->getPermuteTime(src_backend, dst_backend, quant, size);
283 if (time != _exec_time->NOT_FOUND)
286 // Makes the scheduler prefer keeping computations on one backend
290 int64_t HEScheduler::tryBackend(const model::Operation &node, const backend::Backend *backend)
292 // if there is no profiling info don't use this backend during scheduling
293 if (!util::getConfigBool(util::config::PROFILING_MODE))
295 VERBOSE(HEScheduler::tryBackend)
296 << "Trying to HE schedule while there is no profiling info for " << node.name()
297 << " on backend " << backend->config()->id() << ". So this backend won't be used. "
299 _is_supported[backend][node.name()] = false;
300 return _exec_time->getMax();
302 auto iter = _is_supported.find(backend);
303 if (iter != _is_supported.end())
305 auto it2 = iter->second.find(node.name());
306 if (it2 != iter->second.end())
308 return _is_supported[backend][node.name()] ? 1 : _exec_time->getMax();
313 _backend_resolver->getBackendContext(backend)->shape_fixer->fix(node);
315 _is_supported[backend][node.name()] = true;
317 catch (std::runtime_error &e)
319 _is_supported[backend][node.name()] = false;
321 return _is_supported[backend][node.name()] ? 1 : _exec_time->getMax();
324 void HEScheduler::makeRank()
326 VERBOSE(HEScheduler::makeRank) << "task prioritizing" << std::endl;
328 _graph->operations().iterate(
329 [&](const model::OperationIndex &index, const model::Operation &) { DFSMaxRank(index); });
331 // Check that ranks are calculated for all operations(nodes)
332 _graph->operations().iterate([&](const model::OperationIndex &index, const model::Operation &) {
333 UNUSED_RELEASE(index);
334 assert(_op_to_rank->find(index) != _op_to_rank->end());
336 VERBOSE(HEScheduler::makeRank) << "task prioritizing finished" << std::endl;
339 int64_t HEScheduler::DFSMaxRank(const model::OperationIndex &index)
341 auto op_to_rank_it = _op_to_rank->find(index);
342 if (op_to_rank_it != _op_to_rank->end())
343 return op_to_rank_it->second;
345 const auto &node = _graph->operations().at(index);
347 const bool quant = isQuant(*_graph, node);
348 const auto size = getOperationsFlattenedIOSize(*_graph, node);
349 auto supported_backends_quantity = static_cast<int64_t>(_all_backends.size());
351 const auto max_child_rank = DFSChildrenMaxRank(index);
353 // get average exec time of this op
354 for (const auto &backend : _all_backends)
356 auto exec_time = _exec_time->getOperationExecTime(backend, node.name(), quant, size);
357 if (exec_time == _exec_time->NOT_FOUND)
359 exec_time = tryBackend(node, backend);
361 if (exec_time < _exec_time->getMax())
367 // this operation isn't supported in this backend
368 --supported_backends_quantity;
371 if (supported_backends_quantity == 0)
373 throw std::runtime_error{"Encountered unsupported op: " + node.name()};
375 rank /= supported_backends_quantity;
377 // get standard deviation
379 for (const auto backend : _all_backends)
381 const auto exec_time = getOpTime(backend, node.name(), quant, size);
382 if (exec_time < _exec_time->getMax())
384 std += (exec_time - rank) * (exec_time - rank);
387 std /= supported_backends_quantity;
390 std = static_cast<int>(std::sqrt(std));
393 rank += max_child_rank;
396 _rank_to_op.emplace(rank, index);
397 _op_to_rank->emplace(index, rank);
398 VERBOSE(HEScheduler::DFSMaxRank) << "rank of operation (" << index.value() << ")" << node.name()
399 << " is " << rank << std::endl;
404 int64_t HEScheduler::DFSChildrenMaxRank(const model::OperationIndex &index)
406 const auto &node = _graph->operations().at(index);
407 int64_t max_child_rank = 0;
408 for (const auto &output : node.getOutputs())
410 const auto &operand = _graph->operands().at(output);
411 const bool quant = operand.typeInfo().type() == ir::DataType::QUANT8_ASYMM;
412 // average data transfer cost of this operand's data
413 int64_t avg_transfer_cost = 1;
414 for (const auto *backend : _all_backends)
416 for (const auto *other_backend : _all_backends)
418 if (backend == other_backend)
423 _exec_time->getPermuteTime(backend, other_backend, quant, operand.info().total_size());
424 if (transfer_cost == _exec_time->NOT_FOUND)
426 // Makes the scheduler prefer keeping computations on one backend
427 transfer_cost = operand.info().total_size() / 100;
429 avg_transfer_cost += transfer_cost;
432 avg_transfer_cost /= _all_backends.size();
433 for (const auto &use : operand.getUses().list())
435 const auto cur_child_rank = DFSMaxRank(use);
436 max_child_rank = std::max(max_child_rank, cur_child_rank + avg_transfer_cost);
439 return max_child_rank;
442 int64_t HEScheduler::backendAvailableTime(const backend::Backend *backend,
443 const int64_t &starting_time, const int64_t &time_amount)
445 const auto backend_times = _backends_avail_time.at(backend);
446 // finishing and starting times of an op, that will come after current op
447 auto next_op_fst = backend_times.upper_bound(starting_time);
448 // finishing time of an op, that will come before current op
449 auto prev_op_ft = starting_time;
450 // until reach the "hole/gap", that is enough to run this op
451 while (next_op_fst != backend_times.end() && next_op_fst->second - prev_op_ft <= time_amount)
453 prev_op_ft = next_op_fst->first + 1;
459 bool HEScheduler::schedule(const model::OperationIndex &index,
460 const backend::Backend *parent_backend)
462 VERBOSE(HEScheduler::schedule) << "scheduling (" << index.value() << ")" << std::endl;
463 int64_t eft = std::numeric_limits<int64_t>::max(), selected_exec_time = 0;
464 const auto &node = _graph->operations().at(index);
466 std::multimap<int64_t, int64_t> selected_transfer_st_exec_time;
467 // select the backend with the smallest eft of this task
468 const backend::Backend *chosen_backend = nullptr;
469 for (const auto *backend : _all_backends)
471 std::multimap<int64_t, int64_t> transfer_st_exec_time;
472 const auto est_and_et = ESTAndExecTime(backend, index, transfer_st_exec_time);
474 if (eft > est_and_et.first + est_and_et.second)
476 eft = est_and_et.first + est_and_et.second;
477 selected_exec_time = est_and_et.second;
478 chosen_backend = backend;
479 selected_transfer_st_exec_time = transfer_st_exec_time;
483 if (chosen_backend == nullptr)
485 throw std::runtime_error{"Fail to choose backend on scheduler"};
488 // this is part of a branch and it is assigned another backend
489 if (parent_backend && parent_backend != chosen_backend)
493 for (const auto &it : selected_transfer_st_exec_time)
495 auto prev_op_ft = backendAvailableTime(_cpu_backend, it.first, it.second);
496 _backends_avail_time[_cpu_backend].insert({prev_op_ft + it.second, prev_op_ft});
499 _ops_eft[index] = eft;
500 _backends_avail_time[chosen_backend].emplace(eft, eft - selected_exec_time);
501 _backend_resolver->setBackend(index, chosen_backend);
503 VERBOSE(HEScheduler::schedule) << "backend for " << node.name() << " is "
504 << chosen_backend->config()->id() << ". Its eft: " << eft
509 std::pair<int64_t, int64_t>
510 HEScheduler::ESTAndExecTime(const backend::Backend *backend, const model::OperationIndex &index,
511 std::multimap<int64_t, int64_t> &transfer_st_exec_time)
513 const bool is_linear_exec = "Linear" == util::getConfigString(util::config::EXECUTOR);
514 const bool is_parallel_exec = "Parallel" == util::getConfigString(util::config::EXECUTOR);
515 // Permutation will cause creating a separate subgraph that contains just this permutation node.
516 // This isn't needed for Linear executor since it doesn't use subgraphs
517 // Number 1 ms is picked experimentally
518 int64_t permute_fine = 1000;
519 // Multiply cpu operations' exec time by 2 because in parallel executor it might be busy with
520 // permutation on other branches or non-nnfw specific tasks and have to wait for it.
521 // Number 2 is picked experimentally
522 const int64_t CPU_DELAY = 2;
523 const auto &node = _graph->operations().at(index);
524 const bool quant = isQuant(*_graph, node);
525 const auto size = getOperationsFlattenedIOSize(*_graph, node);
526 // if this node can be part of a subgraph, then assigning different backend will cause creating
528 if (isMergable(*_graph, node))
532 if (isWorkaroundSkip(*_graph, backend, node, quant))
534 return {_exec_time->getMax(), _exec_time->getMax()};
536 // get average exec time of the op on this backend
537 auto exec_time = getOpTime(backend, node.name(), quant, size);
538 if (backend->config()->id() == "cpu" && is_parallel_exec)
540 exec_time *= CPU_DELAY;
543 // get max eft of direct (one level above) predecessors
544 auto max_pred_eft = predMaxEFT(backend, node, transfer_st_exec_time);
546 int64_t total_transfer_cost = 0;
547 std::vector<std::multimap<int64_t, int64_t>::iterator> inserted_permutations;
548 // Find free time for data transferring and insert it into backend taskset. This is needed:
549 // 1. Time for multiple permutations for this node's input is found correctly
550 // 2. If backend==cpu, then free time for this node must come after permutations
551 for (auto &it : transfer_st_exec_time)
553 if (is_parallel_exec)
555 it.second *= CPU_DELAY;
559 it.second += permute_fine;
561 total_transfer_cost += it.second;
563 const auto prev_op_ft = backendAvailableTime(_cpu_backend, it.first, it.second);
565 max_pred_eft = std::max(max_pred_eft, prev_op_ft + it.second);
567 const auto tmp = _backends_avail_time[_cpu_backend].emplace(prev_op_ft + it.second, prev_op_ft);
568 inserted_permutations.push_back(tmp.first);
570 // find the hole/gap, where this op can be put or the finishing time of the last assigned op
571 auto prev_op_ft = backendAvailableTime(backend, max_pred_eft, exec_time);
573 // Remove inserted permutation from cpu's task set
574 for (const auto &it : inserted_permutations)
576 _backends_avail_time[_cpu_backend].erase(it);
579 /* In case non-parallel executor measure just exec time and data transfer time
580 * because EFT(prev_op_ft) is the same for all backends. Since two operations
581 * can't be run simultaneously, finish of running operation must be waited for.
582 * When an operation starts, all backends are free. So, they need time just for
584 if (!is_parallel_exec)
586 VERBOSE(HEScheduler::ESTAndExecTime)
587 << "exec_time of (" << index.value() << ") " << node.name() << " quant==" << quant << " on "
588 << backend->config()->id() << " is " << exec_time
589 << " microseconds. Data transfer cost: " << total_transfer_cost << std::endl;
591 return {total_transfer_cost, exec_time};
593 VERBOSE(HEScheduler::ESTAndExecTime)
594 << "exec_time of (" << index.value() << ") " << node.name() << " quant==" << quant << " on "
595 << backend->config()->id() << ": " << exec_time
596 << " microseconds. Backend available time: " << prev_op_ft
597 << " Parent's max eft: " << max_pred_eft - total_transfer_cost
598 << " data transfer cost: " << total_transfer_cost << std::endl;
600 return {prev_op_ft, exec_time};
603 int64_t HEScheduler::predMaxEFT(const backend::Backend *backend, const model::Operation &node,
604 std::multimap<int64_t, int64_t> &transfer_st_exec_time)
606 int64_t max_pred_eft = 0;
607 for (const auto &input_operand_idx : node.getInputs())
609 const auto &input_operand = _graph->operands().at(input_operand_idx);
610 const bool quant = input_operand.typeInfo().type() == ir::DataType::QUANT8_ASYMM;
612 for (const auto &input_node_idx : input_operand.getDef().list())
614 // Data transfer cost from parent's node backend to current node's backend:
615 auto parent_backend = _backend_resolver->getBackend(input_node_idx);
617 max_pred_eft = std::max(max_pred_eft, _ops_eft.at(input_node_idx));
618 if (parent_backend != backend)
620 // Multiply operand size by 2 because size must describe input+output size
621 int64_t transfer_cost =
622 getPermuteTime(parent_backend, backend, quant, input_operand.info().total_size() * 2);
623 transfer_st_exec_time.emplace(_ops_eft.at(input_node_idx), transfer_cost);
630 } // namespace compiler
632 } // namespace neurun