From 9b62ff26b718566c8994ccd7ba49726e3d54160e Mon Sep 17 00:00:00 2001 From: =?utf8?q?=D0=94=D0=B8=D0=BB=D1=88=D0=BE=D0=B4=D0=B6=D0=BE=D0=BD=20?= =?utf8?q?=D0=A3=D0=BC=D1=80=D0=BE=D0=BD=D1=85=D0=BE=D0=BD=D0=BE=D0=B2?= =?utf8?q?=D0=B8=D1=87=20=D0=9F=D0=BE=D1=88=D1=88=D0=BE=D0=B5=D0=B2/AI=20T?= =?utf8?q?ools=20Lab=20/SRR/Engineer/=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?= Date: Thu, 25 Jul 2019 14:29:02 +0900 Subject: [PATCH] Code refactoring Scheduler except makeRank method (#5822) Split Scheduler::scheduleNode into smaller ones, created const Graph *_graph, did const some temp variables Signed-off-by: Dilshodzhon Poshshoev --- runtimes/neurun/core/src/compiler/Scheduler.cc | 268 +++++++++++++------------ runtimes/neurun/core/src/compiler/Scheduler.h | 39 +++- 2 files changed, 178 insertions(+), 129 deletions(-) diff --git a/runtimes/neurun/core/src/compiler/Scheduler.cc b/runtimes/neurun/core/src/compiler/Scheduler.cc index 13f23db..363e839 100644 --- a/runtimes/neurun/core/src/compiler/Scheduler.cc +++ b/runtimes/neurun/core/src/compiler/Scheduler.cc @@ -100,7 +100,7 @@ static bool isWorkaroundSkip(const graph::Graph &graph, const backend::Backend * return false; } -void Scheduler::scheduleShufflingBackends(const graph::Graph &graph) +void Scheduler::scheduleShufflingBackends() { const auto all_backends = _backend_resolver->getAllBackends(); VERBOSE(Scheduler::scheduleNode) @@ -110,9 +110,9 @@ void Scheduler::scheduleShufflingBackends(const graph::Graph &graph) for (const auto &rank : _ranks) { VERBOSE(Scheduler::scheduleNode) << "scheduling (" << rank.second.value() << ")" << std::endl; - const auto &node = graph.operations().at(rank.second); - bool quant = isQuant(graph, node); - auto size = getOperationsFlattenedIOSize(graph, node); + const auto &node = _graph->operations().at(rank.second); + const bool quant = isQuant(*_graph, node); + const auto size = getOperationsFlattenedIOSize(*_graph, node); for (size_t i = 0;; ++i) { if (i == all_backends.size()) @@ -125,12 +125,12 @@ void Scheduler::scheduleShufflingBackends(const graph::Graph &graph) { backend_ind = 0; } - if (isWorkaroundSkip(graph, all_backends[backend_ind], node, quant)) + if (isWorkaroundSkip(*_graph, all_backends[backend_ind], node, quant)) { ++backend_ind; continue; } - auto exec_time = + const auto exec_time = _exec_time->getOperationExecTime(all_backends[backend_ind], node.getName(), quant, size); // Scheduling to measure data transfer must be done after measuring all backends seperately assert(exec_time != _exec_time->NOT_FOUND); @@ -150,9 +150,10 @@ void Scheduler::scheduleShufflingBackends(const graph::Graph &graph) std::unique_ptr Scheduler::schedule(const graph::Graph &graph) { + _graph = &graph; VERBOSE(Scheduler::schedule) << "task scheduling started" << std::endl; // Make ranks and save in descending order - makeRank(graph); + makeRank(); const auto all_backends = _backend_resolver->getAllBackends(); for (const auto *backend : all_backends) @@ -160,16 +161,16 @@ std::unique_ptr Scheduler::schedule(const graph::Grap _backends_avail_time.emplace(backend, std::map{{0, 0}}); } - bool is_profiling = util::getConfigBool(util::config::PROFILING_MODE); + const bool is_profiling = util::getConfigBool(util::config::PROFILING_MODE); if (is_profiling) { bool profile_data_transfer = true; - const auto &node = graph.operations().at(_ranks.begin()->second); - bool quant = isQuant(graph, node); - auto size = getOperationsFlattenedIOSize(graph, node); + const auto &node = _graph->operations().at(_ranks.begin()->second); + const bool quant = isQuant(*_graph, node); + const auto size = getOperationsFlattenedIOSize(*_graph, node); for (const auto *backend : all_backends) { - auto exec_time = _exec_time->getOperationExecTime(backend, node.getName(), quant, size); + const auto exec_time = _exec_time->getOperationExecTime(backend, node.getName(), quant, size); if (exec_time == _exec_time->NOT_FOUND) { profile_data_transfer = false; @@ -178,7 +179,7 @@ std::unique_ptr Scheduler::schedule(const graph::Grap } if (profile_data_transfer) { - scheduleShufflingBackends(graph); + scheduleShufflingBackends(); VERBOSE(Scheduler::schedule) << "task scheduling finished" << std::endl; return nnfw::cpp14::make_unique(*_backend_resolver); } @@ -187,7 +188,7 @@ std::unique_ptr Scheduler::schedule(const graph::Grap // for each task select the backend with the smallest earliest finishing time(eft) for (const auto &rank : _ranks) { - scheduleNode(graph, rank.second); + scheduleNode(rank.second); } VERBOSE(Scheduler::schedule) << "task scheduling finished" << std::endl; return nnfw::cpp14::make_unique(*_backend_resolver); @@ -196,7 +197,7 @@ std::unique_ptr Scheduler::schedule(const graph::Grap int64_t Scheduler::getTime(const backend::Backend *backend, const std::string &operation, bool quant, uint32_t size) { - auto time = _exec_time->getOperationExecTime(backend, operation, quant, size); + const auto time = _exec_time->getOperationExecTime(backend, operation, quant, size); if (time != _exec_time->NOT_FOUND) { return time; @@ -230,7 +231,7 @@ int64_t Scheduler::tryBackend(const model::Operation &node, const backend::Backe } // TODO: Too large function: need code refactoring -void Scheduler::makeRank(const graph::Graph &model) +void Scheduler::makeRank() { VERBOSE(Scheduler::makeRank) << "task prioritizing" << std::endl; @@ -239,7 +240,7 @@ void Scheduler::makeRank(const graph::Graph &model) std::unordered_map calculated_rank; const int NOT_SET = -1; - model.operations().iterate([&](const model::OperationIndex &index, const model::Operation &) { + _graph->operations().iterate([&](const model::OperationIndex &index, const model::Operation &) { calculated_rank[index] = NOT_SET; }); @@ -247,7 +248,7 @@ void Scheduler::makeRank(const graph::Graph &model) [&](const model::OperationIndex &index, const model::Operation &node) -> int64_t { int64_t rank = 0, max_child_rank = 0, std = 0; - bool quant = isQuant(model, node); + const bool quant = isQuant(*_graph, node); if (calculated_rank.at(index) >= 0) return calculated_rank.at(index); @@ -255,7 +256,7 @@ void Scheduler::makeRank(const graph::Graph &model) // get children's max rank for (const auto &output : node.getOutputs()) { - const auto &operand = model.operands().at(output); + const auto &operand = _graph->operands().at(output); // avarage data transfer cost of this operand's data int64_t avg_transfer_cost = 1; for (const auto *backend : all_backends) @@ -282,12 +283,12 @@ void Scheduler::makeRank(const graph::Graph &model) avg_transfer_cost /= all_backends.size(); for (const auto &use : operand.getUses().list()) { - auto cur_child_rank = dfs_recursive(use, model.operations().at(use)); + const auto cur_child_rank = dfs_recursive(use, _graph->operations().at(use)); max_child_rank = std::max(max_child_rank, cur_child_rank + avg_transfer_cost); } } - auto size = getOperationsFlattenedIOSize(model, node); + const auto size = getOperationsFlattenedIOSize(*_graph, node); auto all_backends_size = static_cast(all_backends.size()); // get average exec time of this op for (const auto &backend : all_backends) @@ -313,7 +314,7 @@ void Scheduler::makeRank(const graph::Graph &model) // get standard deviation for (const auto backend : all_backends) { - auto exec_time = getTime(backend, node.getName(), quant, size); + const auto exec_time = getTime(backend, node.getName(), quant, size); if (exec_time < _exec_time->getMax()) { std += (exec_time - rank) * (exec_time - rank); @@ -338,7 +339,7 @@ void Scheduler::makeRank(const graph::Graph &model) }; for (const auto &it : calculated_rank) { - const auto &node = model.operations().at(it.first); + const auto &node = _graph->operations().at(it.first); dfs_recursive(it.first, node); } @@ -352,7 +353,7 @@ void Scheduler::makeRank(const graph::Graph &model) int64_t Scheduler::backendAvailableTime(const backend::Backend *backend, const int64_t &starting_time, const int64_t &time_amount) { - auto backend_times = _backends_avail_time.at(backend); + const auto backend_times = _backends_avail_time.at(backend); // finishing and starting times of an op, that will come after current op auto next_op_fst = backend_times.upper_bound(starting_time); // finishing time of an op, that will come before current op @@ -366,131 +367,148 @@ int64_t Scheduler::backendAvailableTime(const backend::Backend *backend, return prev_op_ft; } -// TODO: Too large function: need code refactoring -void Scheduler::scheduleNode(const graph::Graph &model, const model::OperationIndex &index) +void Scheduler::scheduleNode(const model::OperationIndex &index) { - const bool is_linear_exec = "Linear" == util::getConfigString(util::config::EXECUTOR); - // Multiply cpu operations' exec time by 2 because in parallel executor it might be busy with - // permutation on other branches or non-nnfw specific tasks and have to wait for it. - // Number 2 is picked experimentally - const auto CPU_DELAY = 2; VERBOSE(Scheduler::scheduleNode) << "scheduling (" << index.value() << ")" << std::endl; + const auto all_backends = _backend_resolver->getAllBackends(); int64_t eft = std::numeric_limits::max(), selected_exec_time = 0; - const auto &node = model.operations().at(index); - const auto &obj = model.operands().at(node.getInputs().at(0)); - bool quant = isQuant(model, node); + const auto &node = _graph->operations().at(index); - auto size = getOperationsFlattenedIOSize(model, node); - std::multimap selected_permute_avail_time; + std::multimap selected_transfer_st_exec_time; // select the backend with the smallest eft of this task const backend::Backend *chosen_backend = nullptr; for (const auto *backend : all_backends) { - std::multimap permute_avail_time; - if (isWorkaroundSkip(model, backend, node, quant)) - { - continue; - } - // get average exec time of the op on this backend - auto exec_time = getTime(backend, node.getName(), quant, size); - if (backend->config()->id() == "cpu" && !is_linear_exec) - { - exec_time *= CPU_DELAY; - } - // get max eft of direct (one lavel above) predecessors - int64_t max_pred_eft = 0; - int64_t total_transfer_cost = 0; - for (const auto &input : node.getInputs()) - { - const auto &operand = model.operands().at(input); - // operations, whose output is current node's this input operand - for (const auto &defs : operand.getDef().list()) - { - // Data transfer cost from this parent's backend to current node's backend: - auto parent_backend = _backend_resolver->getBackend(defs); - - if (parent_backend == backend) - { - max_pred_eft = std::max(max_pred_eft, _parents_eft.at(defs)); - } - else - { - // Multiply obj size by 2 because size my discribe input+output size - int64_t transfer_cost = - getTime(parent_backend, backend->config()->id(), quant, obj.info().total_size() * 2); - if (!is_linear_exec) - { - transfer_cost *= CPU_DELAY; - } - total_transfer_cost += transfer_cost; - permute_avail_time.insert({_parents_eft.at(defs), transfer_cost}); - } - } - } - - std::vector::iterator> inserted_permutations; - // Find free time for data transfering and insert it into backend taskset. This is needed: - // 1. Time for multiple permutations for this node's input is found correctly - // 2. If backend==cpu, then free time for this node must come after permutations - for (const auto &it : permute_avail_time) - { - auto prev_op_ft = - backendAvailableTime(_backend_resolver->getBackend("cpu"), it.first, it.second); - max_pred_eft = std::max(max_pred_eft, prev_op_ft + it.second); - - auto tmp = _backends_avail_time[_backend_resolver->getBackend("cpu")].insert( - {prev_op_ft + it.second, prev_op_ft}); - inserted_permutations.push_back(tmp.first); - } - // find the hole/gap, where this op can be put or the finishing time of the last assigned op - auto prev_op_ft = backendAvailableTime(backend, max_pred_eft, exec_time); + std::multimap transfer_st_exec_time; + const auto est_and_et = ESTAndExecTime(backend, index, transfer_st_exec_time); - // Remove inserted permutation from cpu's task set - for (const auto &it : inserted_permutations) - { - _backends_avail_time[_backend_resolver->getBackend("cpu")].erase(it); - } - // In case linear executor measure just exec time and data transfer time - if (is_linear_exec) - { - prev_op_ft = total_transfer_cost; - VERBOSE(Scheduler::scheduleNode) - << "exec_time of (" << index.value() << ") " << node.getName() << " quant==" << quant - << " on " << backend->config()->id() << " is " << exec_time - << " microseconds. Data transfer cost: " << total_transfer_cost << std::endl; - } - else + if (eft > est_and_et.first + est_and_et.second) { - VERBOSE(Scheduler::scheduleNode) - << "exec_time of (" << index.value() << ") " << node.getName() << " quant==" << quant - << " on " << backend->config()->id() << ": " << exec_time - << " microseconds. Backend available time: " << prev_op_ft - << " Parent's max eft: " << max_pred_eft - total_transfer_cost - << " data transfer cost: " << total_transfer_cost << std::endl; - } - if (eft > prev_op_ft + exec_time) - { - eft = prev_op_ft + exec_time; - selected_exec_time = exec_time; + eft = est_and_et.first + est_and_et.second; + selected_exec_time = est_and_et.second; chosen_backend = backend; - selected_permute_avail_time = permute_avail_time; + selected_transfer_st_exec_time = transfer_st_exec_time; } } - for (const auto &it : selected_permute_avail_time) + + for (const auto &it : selected_transfer_st_exec_time) { auto prev_op_ft = backendAvailableTime(_backend_resolver->getBackend("cpu"), it.first, it.second); _backends_avail_time[_backend_resolver->getBackend("cpu")].insert( {prev_op_ft + it.second, prev_op_ft}); } - VERBOSE(Scheduler::scheduleNode) << "backend for " << node.getName() << " is " - << chosen_backend->config()->id() << ". Its eft: " << eft - << std::endl; _parents_eft[index] = eft; _backends_avail_time[chosen_backend].insert({eft, eft - selected_exec_time}); _backend_resolver->setBackend(index, chosen_backend); + + VERBOSE(Scheduler::scheduleNode) << "backend for " << node.getName() << " is " + << chosen_backend->config()->id() << ". Its eft: " << eft + << std::endl; +} + +std::pair +Scheduler::ESTAndExecTime(const backend::Backend *backend, const model::OperationIndex &index, + std::multimap &transfer_st_exec_time) +{ + const bool is_linear_exec = "Linear" == util::getConfigString(util::config::EXECUTOR); + // Multiply cpu operations' exec time by 2 because in parallel executor it might be busy with + // permutation on other branches or non-nnfw specific tasks and have to wait for it. + // Number 2 is picked experimentally + const int64_t CPU_DELAY = 2; + const auto &node = _graph->operations().at(index); + const bool quant = isQuant(*_graph, node); + const auto size = getOperationsFlattenedIOSize(*_graph, node); + if (isWorkaroundSkip(*_graph, backend, node, quant)) + { + return {_exec_time->getMax(), _exec_time->getMax()}; + } + // get average exec time of the op on this backend + auto exec_time = getTime(backend, node.getName(), quant, size); + if (backend->config()->id() == "cpu" && !is_linear_exec) + { + exec_time *= CPU_DELAY; + } + + // get max eft of direct (one lavel above) predecessors + auto max_pred_eft = predMaxEFT(backend, node, quant, transfer_st_exec_time); + + int64_t total_transfer_cost = 0; + std::vector::iterator> inserted_permutations; + // Find free time for data transfering and insert it into backend taskset. This is needed: + // 1. Time for multiple permutations for this node's input is found correctly + // 2. If backend==cpu, then free time for this node must come after permutations + for (auto &it : transfer_st_exec_time) + { + if (!is_linear_exec) + { + it.second *= CPU_DELAY; + } + total_transfer_cost += it.second; + + const auto prev_op_ft = + backendAvailableTime(_backend_resolver->getBackend("cpu"), it.first, it.second); + + max_pred_eft = std::max(max_pred_eft, prev_op_ft + it.second); + + const auto tmp = _backends_avail_time[_backend_resolver->getBackend("cpu")].insert( + {prev_op_ft + it.second, prev_op_ft}); + inserted_permutations.push_back(tmp.first); + } + // find the hole/gap, where this op can be put or the finishing time of the last assigned op + auto prev_op_ft = backendAvailableTime(backend, max_pred_eft, exec_time); + + // Remove inserted permutation from cpu's task set + for (const auto &it : inserted_permutations) + { + _backends_avail_time[_backend_resolver->getBackend("cpu")].erase(it); + } + + // In case linear executor measure just exec time and data transfer time + if (is_linear_exec) + { + VERBOSE(Scheduler::scheduleNode) + << "exec_time of (" << index.value() << ") " << node.getName() << " quant==" << quant + << " on " << backend->config()->id() << " is " << exec_time + << " microseconds. Data transfer cost: " << total_transfer_cost << std::endl; + return {total_transfer_cost, exec_time}; + } + VERBOSE(Scheduler::scheduleNode) << "exec_time of (" << index.value() << ") " << node.getName() + << " quant==" << quant << " on " << backend->config()->id() + << ": " << exec_time + << " microseconds. Backend available time: " << prev_op_ft + << " Parent's max eft: " << max_pred_eft - total_transfer_cost + << " data transfer cost: " << total_transfer_cost << std::endl; + + return {prev_op_ft, exec_time}; +} + +int64_t Scheduler::predMaxEFT(const backend::Backend *backend, const model::Operation &node, + bool quant, std::multimap &transfer_st_exec_time) +{ + int64_t max_pred_eft = 0; + for (const auto &input : node.getInputs()) + { + const auto &operand = _graph->operands().at(input); + // operations, whose output is current node's this input operand + for (const auto &defs : operand.getDef().list()) + { + // Data transfer cost from this parent's backend to current node's backend: + auto parent_backend = _backend_resolver->getBackend(defs); + + max_pred_eft = std::max(max_pred_eft, _parents_eft.at(defs)); + if (parent_backend != backend) + { + // Multiply operand size by 2 because size must discribe input+output size + int64_t transfer_cost = getTime(parent_backend, backend->config()->id(), quant, + operand.info().total_size() * 2); + transfer_st_exec_time.insert({_parents_eft.at(defs), transfer_cost}); + } + } + } + return max_pred_eft; } } // namespace compiler diff --git a/runtimes/neurun/core/src/compiler/Scheduler.h b/runtimes/neurun/core/src/compiler/Scheduler.h index 37e9248..7026bf6 100644 --- a/runtimes/neurun/core/src/compiler/Scheduler.h +++ b/runtimes/neurun/core/src/compiler/Scheduler.h @@ -65,11 +65,40 @@ public: * @note The main idea is taken from HSIP algo: * https://www.hindawi.com/journals/sp/2016/3676149/ */ - std::unique_ptr schedule(const graph::Graph &model) final; + std::unique_ptr schedule(const graph::Graph &graph) final; std::shared_ptr getIndexedRanks() { return _indexed_ranks; } private: - void makeRank(const graph::Graph &model); + void scheduleNode(const model::OperationIndex &); + /** + * @brief Get earliest starting time and execution time of an operation on a backend. + * + * @note Returns a time when operation's inputs are ready and backend is available + * It also returns exec time. If this is "cpu" backend, then exec_time*CPU_DELAY + * + * @param[in] backend: backend, for which to return the time + * @param[in] index: index of an operation + * @param[out] transfer_st_exec_time: est and exec time of data tranfer operation + * + * @return earliest starting tme and execution time + */ + std::pair + ESTAndExecTime(const backend::Backend *backend, const model::OperationIndex &index, + std::multimap &transfer_st_exec_time); + /** + * @brief Returns the latest finishing time of parents of a node. + * + * @param[in] backend: backend, for which to return the time + * @param[in] node: node to get eft of parents + * @param[in] quant: if input data is quantized + * @param[out] transfer_st_exec_time: est and exec time of data tranfer operation + * + * @return earliest finishing time of parent nodes + */ + int64_t predMaxEFT(const backend::Backend *backend, const model::Operation &node, bool quant, + std::multimap &transfer_st_exec_time); + + void makeRank(); /** * @brief Returns the time, when backend is available for at least given amount of time. * @@ -79,6 +108,8 @@ private: * @param[in] backend backend, for which to return the time * @param[in] starting_time time, starting which to look for gap * @param[in] time_amount amound of the time, for which to look gap + * + * @return time, when backend has at least time_amount free time */ int64_t backendAvailableTime(const backend::Backend *backend, const int64_t &starting_time, const int64_t &time_amount); @@ -86,8 +117,7 @@ private: int64_t getTime(const backend::Backend *backend, const std::string &operation, bool quant, uint32_t size); - void scheduleNode(const graph::Graph &model, const model::OperationIndex &); - void scheduleShufflingBackends(const graph::Graph &graph); + void scheduleShufflingBackends(); int64_t tryBackend(const model::Operation &node, const backend::Backend *backend); @@ -100,6 +130,7 @@ private: std::shared_ptr _indexed_ranks; std::unique_ptr _backend_resolver; std::unique_ptr _exec_time; + const graph::Graph *_graph{nullptr}; }; } // namespace compiler -- 2.7.4