2 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include "HEScheduler.h"
19 #include "compiler/BackendResolver.h"
21 #include "util/logging.h"
29 using namespace onert;
31 uint32_t getOperationsFlattenedIOSize(const ir::Graph &graph, const ir::IOperation &node)
34 for (const auto &ind :
35 (node.getInputs() | ir::Remove::UNDEFINED) + (node.getOutputs() | ir::Remove::UNDEFINED))
37 size += graph.operands().at(ind).info().total_size();
42 bool isQuant(const ir::Graph &graph, const ir::IOperation &node)
44 for (const auto &input : node.getInputs() | ir::Remove::UNDEFINED)
46 const auto &obj = graph.operands().at(input);
47 if (obj.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM)
55 bool isWorkaroundSkip(const ir::Graph &, const backend::Backend *, const ir::IOperation &, bool)
57 // Now, there is no workaround
61 // if a node can be merged into op_seq
62 bool isMergeable(const ir::Graph &graph, const ir::IOperation &node)
64 size_t prev_op_cnt = 0;
65 for (const auto &input : node.getInputs() | ir::Remove::UNDEFINED)
68 const auto &operand = graph.operands().at(input);
69 if (operand.isConstant())
72 // This operand is output of operation, not weight or bias
73 if (operand.getDef().valid())
76 // Current node has multiple inputs as concat or at the beginning of the separated branch
77 if (prev_op_cnt > 1 || operand.getUses().size() > 1)
93 void HEScheduler::scheduleShufflingBackends()
95 VERBOSE(HEScheduler::schedule)
96 << "Started task scheduling: uses all backends to get more metrics for data transfer"
98 size_t backend_ind = 0;
99 for (const auto &rank : _rank_to_op)
101 VERBOSE(HEScheduler::schedule) << "scheduling (" << rank.second << ")" << std::endl;
102 const auto &node = _graph->operations().at(rank.second);
103 const bool quant = isQuant(*_graph, node);
104 const auto size = getOperationsFlattenedIOSize(*_graph, node);
105 for (size_t i = 0;; ++i)
107 if (i == _all_backends.size())
109 // wasn't able to find backend
113 if (backend_ind == _all_backends.size())
117 if (isWorkaroundSkip(*_graph, _all_backends[backend_ind], node, quant))
122 const auto exec_time =
123 _exec_time->getOperationExecTime(_all_backends[backend_ind], node.name(), quant, size);
124 // Scheduling to measure data transfer must be done after measuring all backends separately
125 assert(exec_time != _exec_time->NOT_FOUND);
126 if (exec_time == _exec_time->getMax())
131 _backend_resolver->setBackend(rank.second, _all_backends[backend_ind]);
132 VERBOSE(HEScheduler::schedule) << "backend for " << node.name() << " is "
133 << _all_backends[backend_ind]->config()->id() << std::endl;
140 bool HEScheduler::isNodeProfiled(const ir::IOperation &node)
142 const bool quant = isQuant(*_graph, node);
143 const auto size = getOperationsFlattenedIOSize(*_graph, node);
144 for (const auto *backend : _all_backends)
146 const auto exec_time = _exec_time->getOperationExecTime(backend, node.name(), quant, size);
147 if (exec_time == _exec_time->NOT_FOUND)
153 void HEScheduler::scheduleBranch(const ir::OperationIndex &index,
154 ir::OperationIndexMap<bool> &scheduled)
156 auto loc_index = index;
157 const backend::Backend *parent_backend = nullptr;
160 if (scheduled[loc_index])
164 if (!schedule(loc_index, parent_backend))
168 scheduled[loc_index] = true;
169 parent_backend = _backend_resolver->getBackend(loc_index);
171 const auto &node = _graph->operations().at(loc_index);
172 /* get the only output operand, that is input of the next single operation
173 * and just this nodes output.*/
174 if (node.getOutputs().size() != 1)
178 const auto &only_out_operand = _graph->operands().at(*node.getOutputs().begin());
179 // One of the last nodes
180 if (only_out_operand.getUses().size() == 0)
184 loc_index = *only_out_operand.getUses().begin();
185 /* verify, that next node is neither beginning nor ending node of a branch*/
186 const auto &next_node = _graph->operations().at(loc_index);
187 if (!isMergeable(*_graph, next_node))
194 std::unique_ptr<compiler::BackendResolver> HEScheduler::schedule(const ir::Graph &graph)
197 VERBOSE(HEScheduler::schedule) << "task scheduling started" << std::endl;
198 // Make ranks and save in descending order
201 for (const auto *backend : _all_backends)
203 _backends_avail_time.emplace(backend, std::map<int64_t, int64_t>{{0, 0}});
206 if (_is_profiling_mode)
208 // Check if profiling info about all backend/node pairs already exists
209 bool all_nodes_are_profiled = true;
210 _graph->operations().iterate([&](const ir::OperationIndex &, const ir::IOperation &op) {
211 if (all_nodes_are_profiled)
212 all_nodes_are_profiled = isNodeProfiled(op);
215 // If all nodes are already profiled - schedule backends in such order, so more profiling
216 // information about between-backends data transfer could be collected
217 if (all_nodes_are_profiled)
219 scheduleShufflingBackends();
220 VERBOSE(HEScheduler::schedule) << "task scheduling finished" << std::endl;
221 return std::move(_backend_resolver);
225 ir::OperationIndexMap<bool> visited;
226 graph.operations().iterate(
227 [&](const ir::OperationIndex &index, const ir::IOperation &) { visited[index] = false; });
228 // for each task select the backend with the smallest earliest finishing time(eft)
229 for (const auto &rank : _rank_to_op)
231 scheduleBranch(rank.second, visited);
233 VERBOSE(HEScheduler::schedule) << "task scheduling finished" << std::endl;
234 return std::move(_backend_resolver);
237 int64_t HEScheduler::getOpTime(const backend::Backend *backend, const std::string &operation,
238 bool quant, uint32_t size)
240 const auto time = _exec_time->getOperationExecTime(backend, operation, quant, size);
241 if (time != _exec_time->NOT_FOUND)
244 return _is_supported.at(backend).at(operation) ? 1 : _exec_time->getMax();
247 int64_t HEScheduler::getPermuteTime(const backend::Backend *src_backend,
248 const backend::Backend *dst_backend, bool quant, uint32_t size)
250 // TODO Change it to getOperationExecTime()
251 const auto time = _exec_time->getPermuteTime(src_backend, dst_backend, quant, size);
253 if (time != _exec_time->NOT_FOUND)
256 // FIXME permute time is not recorded so the control reaches here always
257 // Makes the scheduler prefer keeping computations on one backend
261 int64_t HEScheduler::tryBackend(const ir::IOperation &node, const backend::Backend *backend)
263 // if there is no profiling info don't use this backend during scheduling
264 if (!_is_profiling_mode)
266 VERBOSE(HEScheduler::tryBackend)
267 << "Trying to HE schedule while there is no profiling info for " << node.name()
268 << " on backend " << backend->config()->id() << ". So this backend won't be used. "
270 _is_supported[backend][node.name()] = false;
271 return _exec_time->getMax();
273 auto iter = _is_supported.find(backend);
274 if (iter != _is_supported.end())
276 auto it2 = iter->second.find(node.name());
277 if (it2 != iter->second.end())
279 return _is_supported[backend][node.name()] ? 1 : _exec_time->getMax();
286 _is_supported[backend][node.name()] = true;
288 catch (std::runtime_error &e)
290 _is_supported[backend][node.name()] = false;
292 return _is_supported[backend][node.name()] ? 1 : _exec_time->getMax();
295 void HEScheduler::makeRank()
297 VERBOSE(HEScheduler::makeRank) << "task prioritizing" << std::endl;
299 _graph->operations().iterate(
300 [&](const ir::OperationIndex &index, const ir::IOperation &) { DFSMaxRank(index); });
302 // Check that ranks are calculated for all operations(nodes)
303 _graph->operations().iterate([&](const ir::OperationIndex &index, const ir::IOperation &) {
304 UNUSED_RELEASE(index);
305 assert(_op_to_rank->find(index) != _op_to_rank->end());
307 VERBOSE(HEScheduler::makeRank) << "task prioritizing finished" << std::endl;
310 int64_t HEScheduler::DFSMaxRank(const ir::OperationIndex &index)
312 auto op_to_rank_it = _op_to_rank->find(index);
313 if (op_to_rank_it != _op_to_rank->end())
314 return op_to_rank_it->second;
316 const auto &node = _graph->operations().at(index);
318 const bool quant = isQuant(*_graph, node);
319 const auto size = getOperationsFlattenedIOSize(*_graph, node);
320 auto supported_backends_quantity = static_cast<int64_t>(_all_backends.size());
322 const auto max_child_rank = DFSChildrenMaxRank(index);
324 // get average exec time of this op
325 for (const auto &backend : _all_backends)
327 auto exec_time = _exec_time->getOperationExecTime(backend, node.name(), quant, size);
328 if (exec_time == _exec_time->NOT_FOUND)
330 exec_time = tryBackend(node, backend);
332 if (exec_time < _exec_time->getMax())
338 // this operation isn't supported in this backend
339 --supported_backends_quantity;
342 if (supported_backends_quantity == 0)
344 throw std::runtime_error{"Encountered unsupported op: " + node.name()};
346 rank /= supported_backends_quantity;
348 // get standard deviation
350 for (const auto backend : _all_backends)
352 const auto exec_time = getOpTime(backend, node.name(), quant, size);
353 if (exec_time < _exec_time->getMax())
355 std += (exec_time - rank) * (exec_time - rank);
358 std /= supported_backends_quantity;
361 std = static_cast<int>(std::sqrt(std));
364 rank += max_child_rank;
367 _rank_to_op.emplace(rank, index);
368 _op_to_rank->emplace(index, rank);
369 VERBOSE(HEScheduler::DFSMaxRank)
370 << "rank of operation (" << index << ")" << node.name() << " is " << rank << std::endl;
375 int64_t HEScheduler::DFSChildrenMaxRank(const ir::OperationIndex &index)
377 const auto &node = _graph->operations().at(index);
378 int64_t max_child_rank = 0;
379 for (const auto &output : node.getOutputs() | ir::Remove::UNDEFINED)
381 const auto &operand = _graph->operands().at(output);
382 const bool quant = operand.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM;
383 // average data transfer cost of this operand's data
384 int64_t avg_transfer_cost = 1;
385 for (const auto *backend : _all_backends)
387 for (const auto *other_backend : _all_backends)
389 if (backend == other_backend)
393 // TODO Change it to builtin backend
395 getPermuteTime(backend, other_backend, quant, operand.info().total_size());
396 avg_transfer_cost += transfer_cost;
399 avg_transfer_cost /= _all_backends.size();
400 for (const auto &use : operand.getUses())
402 const auto cur_child_rank = DFSMaxRank(use);
403 max_child_rank = std::max(max_child_rank, cur_child_rank + avg_transfer_cost);
406 return max_child_rank;
409 int64_t HEScheduler::backendAvailableTime(const backend::Backend *backend,
410 const int64_t &starting_time, const int64_t &time_amount)
412 const auto backend_times = _backends_avail_time.at(backend);
413 // finishing and starting times of an op, that will come after current op
414 auto next_op_fst = backend_times.upper_bound(starting_time);
415 // finishing time of an op, that will come before current op
416 auto prev_op_ft = starting_time;
417 // until reach the "hole/gap", that is enough to run this op
418 while (next_op_fst != backend_times.end() && next_op_fst->second - prev_op_ft <= time_amount)
420 prev_op_ft = next_op_fst->first + 1;
426 bool HEScheduler::schedule(const ir::OperationIndex &index, const backend::Backend *parent_backend)
428 VERBOSE(HEScheduler::schedule) << "scheduling (" << index << ")" << std::endl;
429 int64_t eft = std::numeric_limits<int64_t>::max(), selected_exec_time = 0;
430 const auto &node = _graph->operations().at(index);
432 std::multimap<int64_t, int64_t> selected_transfer_st_exec_time;
433 // select the backend with the smallest eft of this task
434 const backend::Backend *chosen_backend = nullptr;
435 for (const auto *backend : _all_backends)
437 std::multimap<int64_t, int64_t> transfer_st_exec_time;
438 const auto est_and_et = ESTAndExecTime(backend, index, transfer_st_exec_time);
440 if (eft > est_and_et.first + est_and_et.second)
442 eft = est_and_et.first + est_and_et.second;
443 selected_exec_time = est_and_et.second;
444 chosen_backend = backend;
445 selected_transfer_st_exec_time = transfer_st_exec_time;
449 if (chosen_backend == nullptr)
451 throw std::runtime_error{"Fail to choose backend on scheduler"};
454 // this is part of a branch and it is assigned another backend
455 if (parent_backend && parent_backend != chosen_backend)
459 for (const auto &it : selected_transfer_st_exec_time)
461 auto prev_op_ft = backendAvailableTime(_cpu_backend, it.first, it.second);
462 _backends_avail_time[_cpu_backend].insert({prev_op_ft + it.second, prev_op_ft});
465 _ops_eft[index] = eft;
466 _backends_avail_time[chosen_backend].emplace(eft, eft - selected_exec_time);
467 _backend_resolver->setBackend(index, chosen_backend);
469 VERBOSE(HEScheduler::schedule) << "backend for " << node.name() << " is "
470 << chosen_backend->config()->id() << ". Its eft: " << eft
475 std::pair<int64_t, int64_t>
476 HEScheduler::ESTAndExecTime(const backend::Backend *backend, const ir::OperationIndex &index,
477 std::multimap<int64_t, int64_t> &transfer_st_exec_time)
479 // Permutation will cause creating a separate op_seq that contains just this permutation node.
480 // This isn't needed for Linear executor since it doesn't use op_seqs
481 // Number 1 ms is picked experimentally
482 int64_t permute_fine = 1000;
483 // Multiply cpu operations' exec time by 2 because in parallel executor it might be busy with
484 // permutation on other branches or non-nnfw specific tasks and have to wait for it.
485 // Number 2 is picked experimentally
486 const int64_t CPU_DELAY = 2;
487 const auto &node = _graph->operations().at(index);
488 const bool quant = isQuant(*_graph, node);
489 const auto size = getOperationsFlattenedIOSize(*_graph, node);
490 // if this node can be part of a op_seq, then assigning different backend will cause creating
492 if (isMergeable(*_graph, node))
496 if (isWorkaroundSkip(*_graph, backend, node, quant))
498 return {_exec_time->getMax(), _exec_time->getMax()};
500 // get average exec time of the op on this backend
501 auto exec_time = getOpTime(backend, node.name(), quant, size);
502 if (backend->config()->id() == "cpu" && _is_parallel_exec)
504 exec_time *= CPU_DELAY;
507 // get max eft of direct (one level above) predecessors
508 auto max_pred_eft = predMaxEFT(backend, node, transfer_st_exec_time);
510 int64_t total_transfer_cost = 0;
511 std::vector<std::multimap<int64_t, int64_t>::iterator> inserted_permutations;
512 // Find free time for data transferring and insert it into backend taskset. This is needed:
513 // 1. Time for multiple permutations for this node's input is found correctly
514 // 2. If backend==cpu, then free time for this node must come after permutations
515 for (auto &&it : transfer_st_exec_time)
517 if (_is_parallel_exec)
519 it.second *= CPU_DELAY;
521 if (!_is_linear_exec)
523 it.second += permute_fine;
525 total_transfer_cost += it.second;
527 const auto prev_op_ft = backendAvailableTime(_cpu_backend, it.first, it.second);
529 max_pred_eft = std::max(max_pred_eft, prev_op_ft + it.second);
531 const auto tmp = _backends_avail_time[_cpu_backend].emplace(prev_op_ft + it.second, prev_op_ft);
532 inserted_permutations.push_back(tmp.first);
534 // find the hole/gap, where this op can be put or the finishing time of the last assigned op
535 auto prev_op_ft = backendAvailableTime(backend, max_pred_eft, exec_time);
537 // Remove inserted permutation from cpu's task set
538 for (const auto &it : inserted_permutations)
540 _backends_avail_time[_cpu_backend].erase(it);
543 /* In case non-parallel executor measure just exec time and data transfer time
544 * because EFT(prev_op_ft) is the same for all backends. Since two operations
545 * can't be run simultaneously, finish of running operation must be waited for.
546 * When an operation starts, all backends are free. So, they need time just for
548 if (!_is_parallel_exec)
550 VERBOSE(HEScheduler::ESTAndExecTime)
551 << "exec_time of (" << index << ") " << node.name() << " quant==" << quant << " on "
552 << backend->config()->id() << " is " << exec_time
553 << " microseconds. Data transfer cost: " << total_transfer_cost << std::endl;
555 return {total_transfer_cost, exec_time};
557 VERBOSE(HEScheduler::ESTAndExecTime)
558 << "exec_time of (" << index << ") " << node.name() << " quant==" << quant << " on "
559 << backend->config()->id() << ": " << exec_time
560 << " microseconds. Backend available time: " << prev_op_ft
561 << " Parent's max eft: " << max_pred_eft - total_transfer_cost
562 << " data transfer cost: " << total_transfer_cost << std::endl;
564 return {prev_op_ft, exec_time};
567 int64_t HEScheduler::predMaxEFT(const backend::Backend *backend, const ir::IOperation &node,
568 std::multimap<int64_t, int64_t> &transfer_st_exec_time)
570 int64_t max_pred_eft = 0;
571 for (const auto &input_operand_idx : node.getInputs() | ir::Remove::UNDEFINED)
573 const auto &input_operand = _graph->operands().at(input_operand_idx);
574 const bool quant = input_operand.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM;
576 auto input_node_idx = input_operand.getDef();
577 if (input_node_idx.valid())
579 // Data transfer cost from parent's node backend to current node's backend:
580 auto parent_backend = _backend_resolver->getBackend(input_node_idx);
582 max_pred_eft = std::max(max_pred_eft, _ops_eft.at(input_node_idx));
583 if (parent_backend != backend)
585 // Multiply operand size by 2 because size must describe input+output size
586 int64_t transfer_cost =
587 getPermuteTime(parent_backend, backend, quant, input_operand.info().total_size() * 2);
588 transfer_st_exec_time.emplace(_ops_eft.at(input_node_idx), transfer_cost);
595 } // namespace compiler