From bfdab3c2c5cf449f4ec3dcbec005ca2036d9bdca Mon Sep 17 00:00:00 2001
From: Dilshodzhon Umronkhonovich Poshshoev/AI Tools Lab/SRR/Engineer/Samsung Electronics
Date: Tue, 16 Jul 2019 19:23:04 +0900
Subject: [PATCH] [neurun] Schedule ops in holes/gaps of a backend (#5557)

So far the scheduler assumed that a backend is occupied entirely and that
there are no gaps/holes between two tasks. This change adds support for
scheduling operations into such gaps. It also takes into account the
occupation of the cpu backend by permutation (data transfer) operations.

Signed-off-by: Poshshoev Dilshodzhon
---
 runtimes/neurun/core/include/compiler/Scheduler.h | 15 ++++-
 .../neurun/core/src/compiler/BackendResolver.h    | 12 ++--
 runtimes/neurun/core/src/compiler/Scheduler.cc    | 74 ++++++++++++++++------
 3 files changed, 76 insertions(+), 25 deletions(-)

diff --git a/runtimes/neurun/core/include/compiler/Scheduler.h b/runtimes/neurun/core/include/compiler/Scheduler.h
index 3adc5ca..42ab3b3 100644
--- a/runtimes/neurun/core/include/compiler/Scheduler.h
+++ b/runtimes/neurun/core/include/compiler/Scheduler.h
@@ -70,6 +70,18 @@ public:
 private:
   void makeRank(const graph::Graph &model);
 
+  /**
+   * @brief Returns the time when the backend is available for at least the given amount of time.
+   *
+   * @note Returns either a hole/gap between two already scheduled operations,
+   *       or the finishing time of the last scheduled operation
+   *
+   * @param[in] backend backend for which to return the time
+   * @param[in] starting_time time from which to look for a gap
+   * @param[in] time_amount amount of time for which to look for a gap
+   */
+  int64_t backendAvailableTime(const backend::Backend *backend, const int64_t &starting_time,
+                               const int64_t &time_amount);
   int64_t getTime(const backend::Backend *backend, const std::string &operation, bool quant,
                   uint32_t size);
 
@@ -80,7 +92,8 @@ private:
 private:
   std::unordered_map<const backend::Backend *, std::unordered_map<std::string, int64_t>>
       _run_cache;
-  std::unordered_map<const backend::Backend *, int64_t> _backends_avail_time;
+  // Finishing and starting time of each backend
+  std::unordered_map<const backend::Backend *, std::map<int64_t, int64_t>> _backends_avail_time;
   std::unordered_map<model::OperationIndex, int64_t> _parents_eft;
   std::multimap> _ranks;
   std::shared_ptr _indexed_ranks;
diff --git a/runtimes/neurun/core/src/compiler/BackendResolver.h b/runtimes/neurun/core/src/compiler/BackendResolver.h
index 392a204..70f0c7a 100644
--- a/runtimes/neurun/core/src/compiler/BackendResolver.h
+++ b/runtimes/neurun/core/src/compiler/BackendResolver.h
@@ -39,14 +39,16 @@ public:
   }
 
 public:
-  const backend::Backend *getBackend(const std::type_index &type)
+  const backend::Backend *getDefaultBackend() const { return _backend_manager->get("cpu"); }
+
+  const backend::Backend *getBackend(const std::string &id) const
   {
-    return _gen_map_deprecated[type];
+    return _backend_manager->get(id);
   }
-  const backend::Backend *getDefaultBackend() const
+
+  const backend::Backend *getBackend(const std::type_index &type)
   {
-    backend::Backend *default_backend = _backend_manager->get("cpu");
-    return default_backend;
+    return _gen_map_deprecated[type];
   }
 
   const backend::Backend *getBackend(const model::OperationIndex &index)
diff --git a/runtimes/neurun/core/src/compiler/Scheduler.cc b/runtimes/neurun/core/src/compiler/Scheduler.cc
index 91efc29..e4700d9 100644
--- a/runtimes/neurun/core/src/compiler/Scheduler.cc
+++ b/runtimes/neurun/core/src/compiler/Scheduler.cc
@@ -43,7 +43,7 @@ std::unique_ptr Scheduler::schedule(const graph::Grap
   const auto all_backends = _backend_resolver->getAllBackends();
   for (const auto *backend : all_backends)
   {
-    _backends_avail_time.emplace(backend, 0);
+    _backends_avail_time.emplace(backend, std::map<int64_t, int64_t>{{0, 0}});
   }
 
   // for each task select the backend with the smallest earliest finishing time(eft)
@@ -154,8 +154,9 @@ void Scheduler::makeRank(const graph::Graph &model)
       {
         // Permute layer is used for data transfer and it is implemented on cpu backend
         assert(_backend_resolver->getDefaultBackend()->config()->id() == "cpu");
-        transfer_cost = 100; // makes the scheduler prefer keeping computations on one backend
-        _run_cache[backend][other_backend->config()->id()] = 100;
+        // makes the scheduler prefer keeping computations on one backend
+        transfer_cost = operand.info().total_size() / 100;
+        _run_cache[backend][other_backend->config()->id()] = transfer_cost;
       }
       avg_transfer_cost += transfer_cost;
     }
@@ -230,11 +231,28 @@ void Scheduler::makeRank(const graph::Graph &model)
   VERBOSE(Scheduler::makeRank) << "task prioritizing finished" << std::endl;
 }
 
+int64_t Scheduler::backendAvailableTime(const backend::Backend *backend,
+                                        const int64_t &starting_time, const int64_t &time_amount)
+{
+  auto backend_times = _backends_avail_time.at(backend);
+  // finishing and starting times of the op that will come after the current op
+  auto next_op_fst = backend_times.upper_bound(starting_time);
+  // finishing time of the op that will come before the current op
+  auto prev_op_ft = starting_time;
+  // advance until a hole/gap big enough to run this op is reached
+  while (next_op_fst != backend_times.end() && next_op_fst->second - prev_op_ft < time_amount)
+  {
+    prev_op_ft = next_op_fst->first;
+    ++next_op_fst;
+  }
+  return prev_op_ft;
+}
+
 void Scheduler::scheduleNode(const graph::Graph &model, const model::OperationIndex &index)
 {
   VERBOSE(Scheduler::scheduleNode) << "scheduling (" << index.value() << ")" << std::endl;
   const auto all_backends = _backend_resolver->getAllBackends();
-  int64_t eft = std::numeric_limits<int64_t>::max();
+  int64_t eft = std::numeric_limits<int64_t>::max(), selected_exec_time = 0;
   const auto &node = model.operations().at(index);
   const auto &obj = model.operands().at(node.getInputs().at(0));
   bool quant = false;
@@ -245,11 +263,13 @@ void Scheduler::scheduleNode(const graph::Graph &model, const model::OperationIn
   }
 
   auto size = getOperationsFlattenedIOSize(model, node);
-
+  std::vector<std::pair<int64_t, int64_t>> selected_permute_avail_time;
   // select the backend with the smallest eft of this task
   const backend::Backend *chosen_backend = nullptr;
   for (const auto *backend : all_backends)
   {
+    std::vector<std::pair<int64_t, int64_t>> permute_avail_time;
+
    /*
      TODO: this is workaround, come up with better solution if have.
      Adding exception in stage doesn't help. Because if there is a record for add without
      broadcast, scheduling will select it since it doesn't distinguish broadcast and
@@ -286,12 +306,9 @@ void Scheduler::scheduleNode(const graph::Graph &model, const model::OperationIn
    }
    // get average exec time of the op on this backend
    auto exec_time = getTime(backend, node.getName(), quant, size);
-    VERBOSE(Scheduler::scheduleNode) << "expected exec_time of (" << index.value() << ") "
-                                     << node.getName() << " quant==" << quant << " on "
-                                     << backend->config()->id() << " is " << exec_time
-                                     << " microseconds" << std::endl;
    // get max eft of direct (one lavel above) predecessors
    int64_t max_pred_eft = 0;
+    int64_t total_transfer_cost = 0;
    for (const auto &input : node.getInputs())
    {
      const auto &operand = model.operands().at(input);
@@ -301,34 +318,53 @@ void Scheduler::scheduleNode(const graph::Graph &model, const model::OperationIn
        // Data transfer cost from this parent's backend to current node's backend:
        auto parent_backend = _backend_resolver->getBackend(defs);
 
-        int64_t transfer_cost;
        if (parent_backend == backend)
        {
-          transfer_cost = 1;
+          max_pred_eft = std::max(max_pred_eft, _parents_eft.at(defs));
        }
        else
        {
-          transfer_cost =
+          int64_t transfer_cost =
              getTime(parent_backend, backend->config()->id(), quant, obj.info().total_size());
+          total_transfer_cost += transfer_cost;
+
+          // Assumes that cpu is free and proceeds to permutation right away
+          permute_avail_time.push_back({transfer_cost, _parents_eft.at(defs)});
+          max_pred_eft = std::max(max_pred_eft, _parents_eft.at(defs));
        }
-        max_pred_eft = std::max(max_pred_eft, _parents_eft.at(defs) + transfer_cost);
      }
    }
-    auto backend_avail_time = _backends_avail_time[backend];
-    // earliest starting time
-    auto est = std::max(backend_avail_time, max_pred_eft);
-    if (eft > est + exec_time)
+    max_pred_eft += total_transfer_cost;
+    // find the hole/gap, where this op can be put or the finishing time of the last assigned op
+    auto prev_op_ft = backendAvailableTime(backend, max_pred_eft, exec_time);
+
+    VERBOSE(Scheduler::scheduleNode)
+        << "expected exec_time of (" << index.value() << ") " << node.getName()
+        << " quant==" << quant << " on " << backend->config()->id() << " is " << exec_time
+        << " microseconds. Backend available time: " << prev_op_ft
+        << " Parent's max eft: " << max_pred_eft - total_transfer_cost
+        << " total transfer cost: " << total_transfer_cost << std::endl;
+    if (eft > prev_op_ft + exec_time)
    {
-      eft = est + exec_time;
+      eft = prev_op_ft + exec_time;
+      selected_exec_time = exec_time;
      chosen_backend = backend;
+      selected_permute_avail_time = permute_avail_time;
    }
  }
 
+  for (const auto &it : selected_permute_avail_time)
+  {
+    auto prev_op_ft =
+        backendAvailableTime(_backend_resolver->getBackend("cpu"), it.first, it.second);
+    _backends_avail_time[_backend_resolver->getBackend("cpu")].insert(
+        {prev_op_ft + it.second, prev_op_ft});
+  }
  VERBOSE(Scheduler::schedule) << "backend for " << node.getName() << " is "
                               << chosen_backend->config()->id() << ". Its eft: " << eft
                               << std::endl;
  _parents_eft[index] = eft;
-  _backends_avail_time[chosen_backend] = eft + 1;
+  _backends_avail_time[chosen_backend].insert({eft, eft - selected_exec_time});
  _backend_resolver->setBackend(index, chosen_backend);
}
-- 
2.7.4
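
For reference, the gap search that backendAvailableTime() implements can be sketched in
isolation as follows: each backend's occupation is kept as a std::map from finishing time to
starting time of every scheduled op, upper_bound() finds the first op that finishes after the
requested starting time, and the loop skips forward until the hole before the next op is wide
enough or the end of the timeline is reached. This is a minimal standalone sketch, not neurun
code; Timeline, availableTime and reserve are illustrative names only.

// Minimal standalone sketch of the gap search used by backendAvailableTime().
// A backend's occupation is a std::map keyed by finishing time, with the
// starting time as the mapped value, like one entry of _backends_avail_time.
#include <cstdint>
#include <iostream>
#include <map>

using Timeline = std::map<int64_t, int64_t>; // finishing time -> starting time

// Returns the earliest time >= starting_time at which a free slot of at least
// time_amount exists; if no such hole lies between scheduled ops, returns the
// finishing time of the last examined op (i.e. append at the end).
int64_t availableTime(const Timeline &timeline, int64_t starting_time, int64_t time_amount)
{
  auto next_op = timeline.upper_bound(starting_time); // first op finishing after starting_time
  int64_t prev_finish = starting_time;
  while (next_op != timeline.end() && next_op->second - prev_finish < time_amount)
  {
    // The hole before the next op is too small: continue after that op finishes.
    prev_finish = next_op->first;
    ++next_op;
  }
  return prev_finish;
}

// Reserves [start, start + duration), mirroring
// _backends_avail_time[chosen_backend].insert({eft, eft - selected_exec_time}).
void reserve(Timeline &timeline, int64_t start, int64_t duration)
{
  timeline.insert({start + duration, start});
}

int main()
{
  Timeline cpu{{0, 0}}; // sentinel entry, as emplaced for every backend
  reserve(cpu, 0, 10);  // op A occupies [0, 10)
  reserve(cpu, 25, 5);  // op B occupies [25, 30)

  // A 12-us op ready at t=2 fits into the [10, 25) hole, so it starts at 10.
  std::cout << availableTime(cpu, 2, 12) << std::endl; // prints 10
  // A 20-us op fits into no hole, so it is appended after op B at t=30.
  std::cout << availableTime(cpu, 2, 20) << std::endl; // prints 30
  return 0;
}

Keying the map by finishing time is what lets upper_bound() jump straight to the first op that
could conflict with the requested starting time in O(log n), which is why the patch stores a
{finish, start} pair per scheduled op instead of a single availability timestamp per backend.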