runtime/onert/core/src/compiler/HEScheduler.cc

   1 /*
   2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include "ir/Operand.h"
  18 #include "compiler/HEScheduler.h"
  19 #include "ir/Graph.h"
  20 #include "util/ConfigSource.h"
  21 #include "compiler/BackendResolver.h"
  22 #include "util/logging.h"
  23 #include "util/Utils.h"
  24 #include "exec/FunctionSequence.h"
  25 #include <cassert>
  26 #include <cmath>
  27 #include <chrono>
  28
  29 namespace onert
  30 {
  31
  32 namespace compiler
  33 {
  34 static uint32_t getOperationsFlattenedIOSize(const ir::Graph &graph, const ir::Operation &node)
  35 {
  36   uint32_t size = 0;
  37   for (const auto &ind : (node.getInputs() | ir::Remove::UNDEFINED) + node.getOutputs())
  38   {
  39     size += graph.operands().at(ind).info().total_size();
  40   }
  41   return size;
  42 }
  43
  44 static bool isQuant(const ir::Graph &graph, const ir::Operation &node)
  45 {
  46   for (const auto &input : node.getInputs() | ir::Remove::UNDEFINED)
  47   {
  48     const auto &obj = graph.operands().at(input);
  49     if (obj.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM)
  50     {
  51       return true;
  52     }
  53   }
  54   return false;
  55 }
  56
  57 static bool isWorkaroundSkip(const ir::Graph &, const backend::Backend *, const ir::Operation &,
  58                              bool)
  59 {
  60   // Now, there is no workaround
  61   return false;
  62 }
  63
  64 // if a node can be merged into op_seq
  65 static bool isMergeable(const ir::Graph &graph, const ir::Operation &node)
  66 {
  67   size_t prev_op_cnt = 0;
  68   for (const auto &input : node.getInputs())
  69   {
  70     // only valid_inputs
  71     const auto &operand = graph.operands().at(input);
  72     if (operand.isConstant())
  73       continue;
  74
  75     // This operand is output of operation, not weight or bias
  76     if (operand.getDef().valid())
  77       ++prev_op_cnt;
  78
  79     // Current node has multiple inputs as concat or at the beginning of the separated branch
  80     if (prev_op_cnt > 1 || operand.getUses().size() > 1)
  81     {
  82       return false;
  83     }
  84   }
  85   return true;
  86 }
  87
  88 void HEScheduler::scheduleShufflingBackends()
  89 {
  90   VERBOSE(HEScheduler::schedule)
  91       << "Started task scheduling: uses all backends to get more metrics for data transfer"
  92       << std::endl;
  93   size_t backend_ind = 0;
  94   for (const auto &rank : _rank_to_op)
  95   {
  96     VERBOSE(HEScheduler::schedule) << "scheduling (" << rank.second.value() << ")" << std::endl;
  97     const auto &node = _graph->operations().at(rank.second);
  98     const bool quant = isQuant(*_graph, node);
  99     const auto size = getOperationsFlattenedIOSize(*_graph, node);
 100     for (size_t i = 0;; ++i)
 101     {
 102       if (i == _all_backends.size())
 103       {
 104         // wasn't able to find backend
 105         assert(false);
 106         break;
 107       }
 108       if (backend_ind == _all_backends.size())
 109       {
 110         backend_ind = 0;
 111       }
 112       if (isWorkaroundSkip(*_graph, _all_backends[backend_ind], node, quant))
 113       {
 114         ++backend_ind;
 115         continue;
 116       }
 117       const auto exec_time =
 118           _exec_time->getOperationExecTime(_all_backends[backend_ind], node.name(), quant, size);
 119       // Scheduling to measure data transfer must be done after measuring all backends separately
 120       assert(exec_time != _exec_time->NOT_FOUND);
 121       if (exec_time == _exec_time->getMax())
 122       {
 123         ++backend_ind;
 124         continue;
 125       }
 126       _backend_resolver->setBackend(rank.second, _all_backends[backend_ind]);
 127       VERBOSE(HEScheduler::schedule) << "backend for " << node.name() << " is "
 128                                      << _all_backends[backend_ind]->config()->id() << std::endl;
 129       ++backend_ind;
 130       break;
 131     }
 132   }
 133 }
 134
 135 bool HEScheduler::isNodeProfiled(const ir::Operation &node)
 136 {
 137   const bool quant = isQuant(*_graph, node);
 138   const auto size = getOperationsFlattenedIOSize(*_graph, node);
 139   for (const auto *backend : _all_backends)
 140   {
 141     const auto exec_time = _exec_time->getOperationExecTime(backend, node.name(), quant, size);
 142     if (exec_time == _exec_time->NOT_FOUND)
 143       return false;
 144   }
 145   return true;
 146 }
 147
 148 void HEScheduler::scheduleBranch(const ir::OperationIndex &index,
 149                                  ir::OperationIndexMap<bool> &scheduled)
 150 {
 151   auto loc_index = index;
 152   const backend::Backend *parent_backend = nullptr;
 153   while (true)
 154   {
 155     if (scheduled[loc_index])
 156     {
 157       return;
 158     }
 159     if (!schedule(loc_index, parent_backend))
 160     {
 161       return;
 162     }
 163     scheduled[loc_index] = true;
 164     parent_backend = _backend_resolver->getBackend(loc_index);
 165
 166     const auto &node = _graph->operations().at(loc_index);
 167     /* get the only output operand, that is input of the next single operation
 168      *   and just this nodes output.*/
 169     if (node.getOutputs().size() != 1)
 170     {
 171       return;
 172     }
 173     const auto &only_out_operand = _graph->operands().at(*node.getOutputs().begin());
 174     // One of the last nodes
 175     if (only_out_operand.getUses().size() == 0)
 176     {
 177       return;
 178     }
 179     loc_index = *only_out_operand.getUses().begin();
 180     /* verify, that next node is neither beginning nor ending node of a branch*/
 181     const auto &next_node = _graph->operations().at(loc_index);
 182     if (!isMergeable(*_graph, next_node))
 183     {
 184       return;
 185     }
 186   }
 187 }
 188
 189 std::unique_ptr<compiler::BackendResolver> HEScheduler::schedule(const ir::Graph &graph)
 190 {
 191   _graph = &graph;
 192   VERBOSE(HEScheduler::schedule) << "task scheduling started" << std::endl;
 193   // Make ranks and save in descending order
 194   makeRank();
 195
 196   for (const auto *backend : _all_backends)
 197   {
 198     _backends_avail_time.emplace(backend, std::map<int64_t, int64_t>{{0, 0}});
 199   }
 200
 201   if (_is_profiling_mode)
 202   {
 203     // Check if profiling info about all backend/node pairs already exists
 204     bool all_nodes_are_profiled = true;
 205     _graph->operations().iterate([&](const ir::OperationIndex &, const ir::Operation &op) {
 206       if (all_nodes_are_profiled)
 207         all_nodes_are_profiled = isNodeProfiled(op);
 208     });
 209
 210     // If all nodes are already profiled - schedule backends in such order, so more profiling
 211     // information about between-backends data transfer could be collected
 212     if (all_nodes_are_profiled)
 213     {
 214       scheduleShufflingBackends();
 215       VERBOSE(HEScheduler::schedule) << "task scheduling finished" << std::endl;
 216       return std::move(_backend_resolver);
 217     }
 218   }
 219
 220   ir::OperationIndexMap<bool> visited;
 221   graph.operations().iterate(
 222       [&](const ir::OperationIndex &index, const ir::Operation &) { visited[index] = false; });
 223   // for each task select the backend with the smallest earliest finishing time(eft)
 224   for (const auto &rank : _rank_to_op)
 225   {
 226     scheduleBranch(rank.second, visited);
 227   }
 228   VERBOSE(HEScheduler::schedule) << "task scheduling finished" << std::endl;
 229   return std::move(_backend_resolver);
 230 }
 231
 232 int64_t HEScheduler::getOpTime(const backend::Backend *backend, const std::string &operation,
 233                                bool quant, uint32_t size)
 234 {
 235   const auto time = _exec_time->getOperationExecTime(backend, operation, quant, size);
 236   if (time != _exec_time->NOT_FOUND)
 237     return time;
 238
 239   return _is_supported.at(backend).at(operation) ? 1 : _exec_time->getMax();
 240 }
 241
 242 int64_t HEScheduler::getPermuteTime(const backend::Backend *src_backend,
 243                                     const backend::Backend *dst_backend, bool quant, uint32_t size)
 244 {
 245   // TODO Change it to getOperationExecTime()
 246   const auto time = _exec_time->getPermuteTime(src_backend, dst_backend, quant, size);
 247
 248   if (time != _exec_time->NOT_FOUND)
 249     return time;
 250
 251   // Makes the scheduler prefer keeping computations on one backend
 252   return size / 200;
 253 }
 254
 255 int64_t HEScheduler::tryBackend(const ir::Operation &node, const backend::Backend *backend)
 256 {
 257   // if there is no profiling info don't use this backend during scheduling
 258   if (!_is_profiling_mode)
 259   {
 260     VERBOSE(HEScheduler::tryBackend)
 261         << "Trying to HE schedule while there is no profiling info for " << node.name()
 262         << " on backend " << backend->config()->id() << ". So this backend won't be used. "
 263         << std::endl;
 264     _is_supported[backend][node.name()] = false;
 265     return _exec_time->getMax();
 266   }
 267   auto iter = _is_supported.find(backend);
 268   if (iter != _is_supported.end())
 269   {
 270     auto it2 = iter->second.find(node.name());
 271     if (it2 != iter->second.end())
 272     {
 273       return _is_supported[backend][node.name()] ? 1 : _exec_time->getMax();
 274     }
 275   }
 276   try
 277   {
 278     // DO NOTHING
 279
 280     _is_supported[backend][node.name()] = true;
 281   }
 282   catch (std::runtime_error &e)
 283   {
 284     _is_supported[backend][node.name()] = false;
 285   }
 286   return _is_supported[backend][node.name()] ? 1 : _exec_time->getMax();
 287 }
 288
 289 void HEScheduler::makeRank()
 290 {
 291   VERBOSE(HEScheduler::makeRank) << "task prioritizing" << std::endl;
 292
 293   _graph->operations().iterate(
 294       [&](const ir::OperationIndex &index, const ir::Operation &) { DFSMaxRank(index); });
 295
 296   // Check that ranks are calculated for all operations(nodes)
 297   _graph->operations().iterate([&](const ir::OperationIndex &index, const ir::Operation &) {
 298     UNUSED_RELEASE(index);
 299     assert(_op_to_rank->find(index) != _op_to_rank->end());
 300   });
 301   VERBOSE(HEScheduler::makeRank) << "task prioritizing finished" << std::endl;
 302 }
 303
 304 int64_t HEScheduler::DFSMaxRank(const ir::OperationIndex &index)
 305 {
 306   auto op_to_rank_it = _op_to_rank->find(index);
 307   if (op_to_rank_it != _op_to_rank->end())
 308     return op_to_rank_it->second;
 309
 310   const auto &node = _graph->operations().at(index);
 311   int64_t rank = 0;
 312   const bool quant = isQuant(*_graph, node);
 313   const auto size = getOperationsFlattenedIOSize(*_graph, node);
 314   auto supported_backends_quantity = static_cast<int64_t>(_all_backends.size());
 315
 316   const auto max_child_rank = DFSChildrenMaxRank(index);
 317
 318   // get average exec time of this op
 319   for (const auto &backend : _all_backends)
 320   {
 321     auto exec_time = _exec_time->getOperationExecTime(backend, node.name(), quant, size);
 322     if (exec_time == _exec_time->NOT_FOUND)
 323     {
 324       exec_time = tryBackend(node, backend);
 325     }
 326     if (exec_time < _exec_time->getMax())
 327     {
 328       rank += exec_time;
 329     }
 330     else
 331     {
 332       // this operation isn't supported in this backend
 333       --supported_backends_quantity;
 334     }
 335   }
 336   if (supported_backends_quantity == 0)
 337   {
 338     throw std::runtime_error{"Encountered unsupported op: " + node.name()};
 339   }
 340   rank /= supported_backends_quantity;
 341
 342   // get standard deviation
 343   int64_t std = 0;
 344   for (const auto backend : _all_backends)
 345   {
 346     const auto exec_time = getOpTime(backend, node.name(), quant, size);
 347     if (exec_time < _exec_time->getMax())
 348     {
 349       std += (exec_time - rank) * (exec_time - rank);
 350     }
 351   }
 352   std /= supported_backends_quantity;
 353   if (std > 0)
 354   {
 355     std = static_cast<int>(std::sqrt(std));
 356     rank *= std;
 357   }
 358   rank += max_child_rank;
 359
 360   assert(rank >= 0);
 361   _rank_to_op.emplace(rank, index);
 362   _op_to_rank->emplace(index, rank);
 363   VERBOSE(HEScheduler::DFSMaxRank) << "rank of operation (" << index.value() << ")" << node.name()
 364                                    << " is " << rank << std::endl;
 365
 366   return rank;
 367 }
 368
 369 int64_t HEScheduler::DFSChildrenMaxRank(const ir::OperationIndex &index)
 370 {
 371   const auto &node = _graph->operations().at(index);
 372   int64_t max_child_rank = 0;
 373   for (const auto &output : node.getOutputs())
 374   {
 375     const auto &operand = _graph->operands().at(output);
 376     const bool quant = operand.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM;
 377     // average data transfer cost of this operand's data
 378     int64_t avg_transfer_cost = 1;
 379     for (const auto *backend : _all_backends)
 380     {
 381       for (const auto *other_backend : _all_backends)
 382       {
 383         if (backend == other_backend)
 384         {
 385           continue;
 386         }
 387         // TODO Change it to controlflow backend
 388         auto transfer_cost =
 389             getPermuteTime(backend, other_backend, quant, operand.info().total_size());
 390         avg_transfer_cost += transfer_cost;
 391       }
 392     }
 393     avg_transfer_cost /= _all_backends.size();
 394     for (const auto &use : operand.getUses())
 395     {
 396       const auto cur_child_rank = DFSMaxRank(use);
 397       max_child_rank = std::max(max_child_rank, cur_child_rank + avg_transfer_cost);
 398     }
 399   }
 400   return max_child_rank;
 401 }
 402
 403 int64_t HEScheduler::backendAvailableTime(const backend::Backend *backend,
 404                                           const int64_t &starting_time, const int64_t &time_amount)
 405 {
 406   const auto backend_times = _backends_avail_time.at(backend);
 407   // finishing and starting times of an op, that will come after current op
 408   auto next_op_fst = backend_times.upper_bound(starting_time);
 409   // finishing time of an op, that will come before current op
 410   auto prev_op_ft = starting_time;
 411   // until reach the "hole/gap", that is enough to run this op
 412   while (next_op_fst != backend_times.end() && next_op_fst->second - prev_op_ft <= time_amount)
 413   {
 414     prev_op_ft = next_op_fst->first + 1;
 415     ++next_op_fst;
 416   }
 417   return prev_op_ft;
 418 }
 419
 420 bool HEScheduler::schedule(const ir::OperationIndex &index, const backend::Backend *parent_backend)
 421 {
 422   VERBOSE(HEScheduler::schedule) << "scheduling (" << index.value() << ")" << std::endl;
 423   int64_t eft = std::numeric_limits<int64_t>::max(), selected_exec_time = 0;
 424   const auto &node = _graph->operations().at(index);
 425
 426   std::multimap<int64_t, int64_t> selected_transfer_st_exec_time;
 427   // select the backend with the smallest eft of this task
 428   const backend::Backend *chosen_backend = nullptr;
 429   for (const auto *backend : _all_backends)
 430   {
 431     std::multimap<int64_t, int64_t> transfer_st_exec_time;
 432     const auto est_and_et = ESTAndExecTime(backend, index, transfer_st_exec_time);
 433
 434     if (eft > est_and_et.first + est_and_et.second)
 435     {
 436       eft = est_and_et.first + est_and_et.second;
 437       selected_exec_time = est_and_et.second;
 438       chosen_backend = backend;
 439       selected_transfer_st_exec_time = transfer_st_exec_time;
 440     }
 441   }
 442
 443   if (chosen_backend == nullptr)
 444   {
 445     throw std::runtime_error{"Fail to choose backend on scheduler"};
 446   }
 447
 448   // this is part of a branch and it is assigned another backend
 449   if (parent_backend && parent_backend != chosen_backend)
 450   {
 451     return false;
 452   }
 453   for (const auto &it : selected_transfer_st_exec_time)
 454   {
 455     auto prev_op_ft = backendAvailableTime(_cpu_backend, it.first, it.second);
 456     _backends_avail_time[_cpu_backend].insert({prev_op_ft + it.second, prev_op_ft});
 457   }
 458
 459   _ops_eft[index] = eft;
 460   _backends_avail_time[chosen_backend].emplace(eft, eft - selected_exec_time);
 461   _backend_resolver->setBackend(index, chosen_backend);
 462
 463   VERBOSE(HEScheduler::schedule) << "backend for " << node.name() << " is "
 464                                  << chosen_backend->config()->id() << ". Its eft: " << eft
 465                                  << std::endl;
 466   return true;
 467 }
 468
 469 std::pair<int64_t, int64_t>
 470 HEScheduler::ESTAndExecTime(const backend::Backend *backend, const ir::OperationIndex &index,
 471                             std::multimap<int64_t, int64_t> &transfer_st_exec_time)
 472 {
 473   // Permutation will cause creating a separate op_seq that contains just this permutation node.
 474   // This isn't needed for Linear executor since it doesn't use op_seqs
 475   // Number 1 ms is picked experimentally
 476   int64_t permute_fine = 1000;
 477   // Multiply cpu operations' exec time by 2 because in parallel executor it might be busy with
 478   // permutation on other branches or non-nnfw specific tasks and have to wait for it.
 479   // Number 2 is picked experimentally
 480   const int64_t CPU_DELAY = 2;
 481   const auto &node = _graph->operations().at(index);
 482   const bool quant = isQuant(*_graph, node);
 483   const auto size = getOperationsFlattenedIOSize(*_graph, node);
 484   // if this node can be part of a op_seq, then assigning different backend will cause creating
 485   // another op_seq
 486   if (isMergeable(*_graph, node))
 487   {
 488     permute_fine *= 2;
 489   }
 490   if (isWorkaroundSkip(*_graph, backend, node, quant))
 491   {
 492     return {_exec_time->getMax(), _exec_time->getMax()};
 493   }
 494   // get average exec time of the op on this backend
 495   auto exec_time = getOpTime(backend, node.name(), quant, size);
 496   if (backend->config()->id() == "cpu" && _is_parallel_exec)
 497   {
 498     exec_time *= CPU_DELAY;
 499   }
 500
 501   // get max eft of direct (one level above) predecessors
 502   auto max_pred_eft = predMaxEFT(backend, node, transfer_st_exec_time);
 503
 504   int64_t total_transfer_cost = 0;
 505   std::vector<std::multimap<int64_t, int64_t>::iterator> inserted_permutations;
 506   // Find free time for data transferring and insert it into backend taskset. This is needed:
 507   //  1. Time for multiple permutations for this node's input is found correctly
 508   //  2. If backend==cpu, then free time for this node must come after permutations
 509   for (auto &it : transfer_st_exec_time)
 510   {
 511     if (_is_parallel_exec)
 512     {
 513       it.second *= CPU_DELAY;
 514     }
 515     if (!_is_linear_exec)
 516     {
 517       it.second += permute_fine;
 518     }
 519     total_transfer_cost += it.second;
 520
 521     const auto prev_op_ft = backendAvailableTime(_cpu_backend, it.first, it.second);
 522
 523     max_pred_eft = std::max(max_pred_eft, prev_op_ft + it.second);
 524
 525     const auto tmp = _backends_avail_time[_cpu_backend].emplace(prev_op_ft + it.second, prev_op_ft);
 526     inserted_permutations.push_back(tmp.first);
 527   }
 528   // find the hole/gap, where this op can be put or the finishing time of the last assigned op
 529   auto prev_op_ft = backendAvailableTime(backend, max_pred_eft, exec_time);
 530
 531   // Remove inserted permutation from cpu's task set
 532   for (const auto &it : inserted_permutations)
 533   {
 534     _backends_avail_time[_cpu_backend].erase(it);
 535   }
 536
 537   /* In case non-parallel executor measure just exec time and data transfer time
 538    * because EFT(prev_op_ft) is the same for all backends. Since two operations
 539    * can't be run simultaneously, finish of running operation must be waited for.
 540    * When an operation starts, all backends are free. So, they need time just for
 541    * data transfer.*/
 542   if (!_is_parallel_exec)
 543   {
 544     VERBOSE(HEScheduler::ESTAndExecTime)
 545         << "exec_time of (" << index.value() << ") " << node.name() << " quant==" << quant << " on "
 546         << backend->config()->id() << " is " << exec_time
 547         << " microseconds. Data transfer cost: " << total_transfer_cost << std::endl;
 548
 549     return {total_transfer_cost, exec_time};
 550   }
 551   VERBOSE(HEScheduler::ESTAndExecTime)
 552       << "exec_time of (" << index.value() << ") " << node.name() << " quant==" << quant << " on "
 553       << backend->config()->id() << ": " << exec_time
 554       << " microseconds. Backend available time: " << prev_op_ft
 555       << " Parent's max eft: " << max_pred_eft - total_transfer_cost
 556       << " data transfer cost: " << total_transfer_cost << std::endl;
 557
 558   return {prev_op_ft, exec_time};
 559 }
 560
 561 int64_t HEScheduler::predMaxEFT(const backend::Backend *backend, const ir::Operation &node,
 562                                 std::multimap<int64_t, int64_t> &transfer_st_exec_time)
 563 {
 564   int64_t max_pred_eft = 0;
 565   for (const auto &input_operand_idx : node.getInputs())
 566   {
 567     const auto &input_operand = _graph->operands().at(input_operand_idx);
 568     const bool quant = input_operand.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM;
 569
 570     auto input_node_idx = input_operand.getDef();
 571     if (input_node_idx.valid())
 572     {
 573       // Data transfer cost from parent's node backend to current node's backend:
 574       auto parent_backend = _backend_resolver->getBackend(input_node_idx);
 575
 576       max_pred_eft = std::max(max_pred_eft, _ops_eft.at(input_node_idx));
 577       if (parent_backend != backend)
 578       {
 579         // Multiply operand size by 2 because size must describe input+output size
 580         int64_t transfer_cost =
 581             getPermuteTime(parent_backend, backend, quant, input_operand.info().total_size() * 2);
 582         transfer_st_exec_time.emplace(_ops_eft.at(input_node_idx), transfer_cost);
 583       }
 584     }
 585   }
 586   return max_pred_eft;
 587 }
 588
 589 } // namespace compiler
 590
 591 } // namespace onert