runtime/onert/core/src/compiler/HEScheduler.cc

   1 /*
   2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include "HEScheduler.h"
  18
  19 #include "compiler/BackendResolver.h"
  20 #include "ir/Graph.h"
  21 #include "util/logging.h"
  22
  23 #include <cassert>
  24 #include <cmath>
  25
  26 namespace
  27 {
  28
  29 using namespace onert;
  30
  31 uint32_t getOperationsFlattenedIOSize(const ir::Graph &graph, const ir::IOperation &node)
  32 {
  33   uint32_t size = 0;
  34   for (const auto &ind :
  35        (node.getInputs() | ir::Remove::UNDEFINED) + (node.getOutputs() | ir::Remove::UNDEFINED))
  36   {
  37     size += graph.operands().at(ind).info().total_size();
  38   }
  39   return size;
  40 }
  41
  42 bool isQuant(const ir::Graph &graph, const ir::IOperation &node)
  43 {
  44   for (const auto &input : node.getInputs() | ir::Remove::UNDEFINED)
  45   {
  46     const auto &obj = graph.operands().at(input);
  47     if (obj.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM)
  48     {
  49       return true;
  50     }
  51   }
  52   return false;
  53 }
  54
  55 bool isWorkaroundSkip(const ir::Graph &, const backend::Backend *, const ir::IOperation &, bool)
  56 {
  57   // Now, there is no workaround
  58   return false;
  59 }
  60
  61 // if a node can be merged into op_seq
  62 bool isMergeable(const ir::Graph &graph, const ir::IOperation &node)
  63 {
  64   size_t prev_op_cnt = 0;
  65   for (const auto &input : node.getInputs() | ir::Remove::UNDEFINED)
  66   {
  67     // only valid_inputs
  68     const auto &operand = graph.operands().at(input);
  69     if (operand.isConstant())
  70       continue;
  71
  72     // This operand is output of operation, not weight or bias
  73     if (operand.getDef().valid())
  74       ++prev_op_cnt;
  75
  76     // Current node has multiple inputs as concat or at the beginning of the separated branch
  77     if (prev_op_cnt > 1 || operand.getUses().size() > 1)
  78     {
  79       return false;
  80     }
  81   }
  82   return true;
  83 }
  84
  85 } // namespace
  86
  87 namespace onert
  88 {
  89
  90 namespace compiler
  91 {
  92
  93 void HEScheduler::scheduleShufflingBackends()
  94 {
  95   VERBOSE(HEScheduler::schedule)
  96     << "Started task scheduling: uses all backends to get more metrics for data transfer"
  97     << std::endl;
  98   size_t backend_ind = 0;
  99   for (const auto &rank : _rank_to_op)
 100   {
 101     VERBOSE(HEScheduler::schedule) << "scheduling (" << rank.second << ")" << std::endl;
 102     const auto &node = _graph->operations().at(rank.second);
 103     const bool quant = isQuant(*_graph, node);
 104     const auto size = getOperationsFlattenedIOSize(*_graph, node);
 105     for (size_t i = 0;; ++i)
 106     {
 107       if (i == _all_backends.size())
 108       {
 109         // wasn't able to find backend
 110         assert(false);
 111         break;
 112       }
 113       if (backend_ind == _all_backends.size())
 114       {
 115         backend_ind = 0;
 116       }
 117       if (isWorkaroundSkip(*_graph, _all_backends[backend_ind], node, quant))
 118       {
 119         ++backend_ind;
 120         continue;
 121       }
 122       const auto exec_time =
 123         _exec_time->getOperationExecTime(_all_backends[backend_ind], node.name(), quant, size);
 124       // Scheduling to measure data transfer must be done after measuring all backends separately
 125       assert(exec_time != _exec_time->NOT_FOUND);
 126       if (exec_time == _exec_time->getMax())
 127       {
 128         ++backend_ind;
 129         continue;
 130       }
 131       _backend_resolver->setBackend(rank.second, _all_backends[backend_ind]);
 132       VERBOSE(HEScheduler::schedule) << "backend for " << node.name() << " is "
 133                                      << _all_backends[backend_ind]->config()->id() << std::endl;
 134       ++backend_ind;
 135       break;
 136     }
 137   }
 138 }
 139
 140 bool HEScheduler::isNodeProfiled(const ir::IOperation &node)
 141 {
 142   const bool quant = isQuant(*_graph, node);
 143   const auto size = getOperationsFlattenedIOSize(*_graph, node);
 144   for (const auto *backend : _all_backends)
 145   {
 146     const auto exec_time = _exec_time->getOperationExecTime(backend, node.name(), quant, size);
 147     if (exec_time == _exec_time->NOT_FOUND)
 148       return false;
 149   }
 150   return true;
 151 }
 152
 153 void HEScheduler::scheduleBranch(const ir::OperationIndex &index,
 154                                  ir::OperationIndexMap<bool> &scheduled)
 155 {
 156   auto loc_index = index;
 157   const backend::Backend *parent_backend = nullptr;
 158   while (true)
 159   {
 160     if (scheduled[loc_index])
 161     {
 162       return;
 163     }
 164     if (!schedule(loc_index, parent_backend))
 165     {
 166       return;
 167     }
 168     scheduled[loc_index] = true;
 169     parent_backend = _backend_resolver->getBackend(loc_index);
 170
 171     const auto &node = _graph->operations().at(loc_index);
 172     /* get the only output operand, that is input of the next single operation
 173      *   and just this nodes output.*/
 174     if (node.getOutputs().size() != 1)
 175     {
 176       return;
 177     }
 178     const auto &only_out_operand = _graph->operands().at(*node.getOutputs().begin());
 179     // One of the last nodes
 180     if (only_out_operand.getUses().size() == 0)
 181     {
 182       return;
 183     }
 184     loc_index = *only_out_operand.getUses().begin();
 185     /* verify, that next node is neither beginning nor ending node of a branch*/
 186     const auto &next_node = _graph->operations().at(loc_index);
 187     if (!isMergeable(*_graph, next_node))
 188     {
 189       return;
 190     }
 191   }
 192 }
 193
 194 std::unique_ptr<compiler::BackendResolver> HEScheduler::schedule(const ir::Graph &graph)
 195 {
 196   _graph = &graph;
 197   VERBOSE(HEScheduler::schedule) << "task scheduling started" << std::endl;
 198   // Make ranks and save in descending order
 199   makeRank();
 200
 201   for (const auto *backend : _all_backends)
 202   {
 203     _backends_avail_time.emplace(backend, std::map<int64_t, int64_t>{{0, 0}});
 204   }
 205
 206   if (_is_profiling_mode)
 207   {
 208     // Check if profiling info about all backend/node pairs already exists
 209     bool all_nodes_are_profiled = true;
 210     _graph->operations().iterate([&](const ir::OperationIndex &, const ir::IOperation &op) {
 211       if (all_nodes_are_profiled)
 212         all_nodes_are_profiled = isNodeProfiled(op);
 213     });
 214
 215     // If all nodes are already profiled - schedule backends in such order, so more profiling
 216     // information about between-backends data transfer could be collected
 217     if (all_nodes_are_profiled)
 218     {
 219       scheduleShufflingBackends();
 220       VERBOSE(HEScheduler::schedule) << "task scheduling finished" << std::endl;
 221       return std::move(_backend_resolver);
 222     }
 223   }
 224
 225   ir::OperationIndexMap<bool> visited;
 226   graph.operations().iterate(
 227     [&](const ir::OperationIndex &index, const ir::IOperation &) { visited[index] = false; });
 228   // for each task select the backend with the smallest earliest finishing time(eft)
 229   for (const auto &rank : _rank_to_op)
 230   {
 231     scheduleBranch(rank.second, visited);
 232   }
 233   VERBOSE(HEScheduler::schedule) << "task scheduling finished" << std::endl;
 234   return std::move(_backend_resolver);
 235 }
 236
 237 int64_t HEScheduler::getOpTime(const backend::Backend *backend, const std::string &operation,
 238                                bool quant, uint32_t size)
 239 {
 240   const auto time = _exec_time->getOperationExecTime(backend, operation, quant, size);
 241   if (time != _exec_time->NOT_FOUND)
 242     return time;
 243
 244   return _is_supported.at(backend).at(operation) ? 1 : _exec_time->getMax();
 245 }
 246
 247 int64_t HEScheduler::getPermuteTime(const backend::Backend *src_backend,
 248                                     const backend::Backend *dst_backend, bool quant, uint32_t size)
 249 {
 250   // TODO Change it to getOperationExecTime()
 251   const auto time = _exec_time->getPermuteTime(src_backend, dst_backend, quant, size);
 252
 253   if (time != _exec_time->NOT_FOUND)
 254     return time;
 255
 256   // FIXME permute time is not recorded so the control reaches here always
 257   // Makes the scheduler prefer keeping computations on one backend
 258   return size / 400;
 259 }
 260
 261 int64_t HEScheduler::tryBackend(const ir::IOperation &node, const backend::Backend *backend)
 262 {
 263   // if there is no profiling info don't use this backend during scheduling
 264   if (!_is_profiling_mode)
 265   {
 266     VERBOSE(HEScheduler::tryBackend)
 267       << "Trying to HE schedule while there is no profiling info for " << node.name()
 268       << " on backend " << backend->config()->id() << ". So this backend won't be used. "
 269       << std::endl;
 270     _is_supported[backend][node.name()] = false;
 271     return _exec_time->getMax();
 272   }
 273   auto iter = _is_supported.find(backend);
 274   if (iter != _is_supported.end())
 275   {
 276     auto it2 = iter->second.find(node.name());
 277     if (it2 != iter->second.end())
 278     {
 279       return _is_supported[backend][node.name()] ? 1 : _exec_time->getMax();
 280     }
 281   }
 282   try
 283   {
 284     // DO NOTHING
 285
 286     _is_supported[backend][node.name()] = true;
 287   }
 288   catch (std::runtime_error &e)
 289   {
 290     _is_supported[backend][node.name()] = false;
 291   }
 292   return _is_supported[backend][node.name()] ? 1 : _exec_time->getMax();
 293 }
 294
 295 void HEScheduler::makeRank()
 296 {
 297   VERBOSE(HEScheduler::makeRank) << "task prioritizing" << std::endl;
 298
 299   _graph->operations().iterate(
 300     [&](const ir::OperationIndex &index, const ir::IOperation &) { DFSMaxRank(index); });
 301
 302   // Check that ranks are calculated for all operations(nodes)
 303   _graph->operations().iterate([&](const ir::OperationIndex &index, const ir::IOperation &) {
 304     UNUSED_RELEASE(index);
 305     assert(_op_to_rank->find(index) != _op_to_rank->end());
 306   });
 307   VERBOSE(HEScheduler::makeRank) << "task prioritizing finished" << std::endl;
 308 }
 309
 310 int64_t HEScheduler::DFSMaxRank(const ir::OperationIndex &index)
 311 {
 312   auto op_to_rank_it = _op_to_rank->find(index);
 313   if (op_to_rank_it != _op_to_rank->end())
 314     return op_to_rank_it->second;
 315
 316   const auto &node = _graph->operations().at(index);
 317   int64_t rank = 0;
 318   const bool quant = isQuant(*_graph, node);
 319   const auto size = getOperationsFlattenedIOSize(*_graph, node);
 320   auto supported_backends_quantity = static_cast<int64_t>(_all_backends.size());
 321
 322   const auto max_child_rank = DFSChildrenMaxRank(index);
 323
 324   // get average exec time of this op
 325   for (const auto &backend : _all_backends)
 326   {
 327     auto exec_time = _exec_time->getOperationExecTime(backend, node.name(), quant, size);
 328     if (exec_time == _exec_time->NOT_FOUND)
 329     {
 330       exec_time = tryBackend(node, backend);
 331     }
 332     if (exec_time < _exec_time->getMax())
 333     {
 334       rank += exec_time;
 335     }
 336     else
 337     {
 338       // this operation isn't supported in this backend
 339       --supported_backends_quantity;
 340     }
 341   }
 342   if (supported_backends_quantity == 0)
 343   {
 344     throw std::runtime_error{"Encountered unsupported op: " + node.name()};
 345   }
 346   rank /= supported_backends_quantity;
 347
 348   // get standard deviation
 349   int64_t std = 0;
 350   for (const auto backend : _all_backends)
 351   {
 352     const auto exec_time = getOpTime(backend, node.name(), quant, size);
 353     if (exec_time < _exec_time->getMax())
 354     {
 355       std += (exec_time - rank) * (exec_time - rank);
 356     }
 357   }
 358   std /= supported_backends_quantity;
 359   if (std > 0)
 360   {
 361     std = static_cast<int>(std::sqrt(std));
 362     rank *= std;
 363   }
 364   rank += max_child_rank;
 365
 366   assert(rank >= 0);
 367   _rank_to_op.emplace(rank, index);
 368   _op_to_rank->emplace(index, rank);
 369   VERBOSE(HEScheduler::DFSMaxRank)
 370     << "rank of operation (" << index << ")" << node.name() << " is " << rank << std::endl;
 371
 372   return rank;
 373 }
 374
 375 int64_t HEScheduler::DFSChildrenMaxRank(const ir::OperationIndex &index)
 376 {
 377   const auto &node = _graph->operations().at(index);
 378   int64_t max_child_rank = 0;
 379   for (const auto &output : node.getOutputs() | ir::Remove::UNDEFINED)
 380   {
 381     const auto &operand = _graph->operands().at(output);
 382     const bool quant = operand.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM;
 383     // average data transfer cost of this operand's data
 384     int64_t avg_transfer_cost = 1;
 385     for (const auto *backend : _all_backends)
 386     {
 387       for (const auto *other_backend : _all_backends)
 388       {
 389         if (backend == other_backend)
 390         {
 391           continue;
 392         }
 393         // TODO Change it to builtin backend
 394         auto transfer_cost =
 395           getPermuteTime(backend, other_backend, quant, operand.info().total_size());
 396         avg_transfer_cost += transfer_cost;
 397       }
 398     }
 399     avg_transfer_cost /= _all_backends.size();
 400     for (const auto &use : operand.getUses())
 401     {
 402       const auto cur_child_rank = DFSMaxRank(use);
 403       max_child_rank = std::max(max_child_rank, cur_child_rank + avg_transfer_cost);
 404     }
 405   }
 406   return max_child_rank;
 407 }
 408
 409 int64_t HEScheduler::backendAvailableTime(const backend::Backend *backend,
 410                                           const int64_t &starting_time, const int64_t &time_amount)
 411 {
 412   const auto backend_times = _backends_avail_time.at(backend);
 413   // finishing and starting times of an op, that will come after current op
 414   auto next_op_fst = backend_times.upper_bound(starting_time);
 415   // finishing time of an op, that will come before current op
 416   auto prev_op_ft = starting_time;
 417   // until reach the "hole/gap", that is enough to run this op
 418   while (next_op_fst != backend_times.end() && next_op_fst->second - prev_op_ft <= time_amount)
 419   {
 420     prev_op_ft = next_op_fst->first + 1;
 421     ++next_op_fst;
 422   }
 423   return prev_op_ft;
 424 }
 425
 426 bool HEScheduler::schedule(const ir::OperationIndex &index, const backend::Backend *parent_backend)
 427 {
 428   VERBOSE(HEScheduler::schedule) << "scheduling (" << index << ")" << std::endl;
 429   int64_t eft = std::numeric_limits<int64_t>::max(), selected_exec_time = 0;
 430   const auto &node = _graph->operations().at(index);
 431
 432   std::multimap<int64_t, int64_t> selected_transfer_st_exec_time;
 433   // select the backend with the smallest eft of this task
 434   const backend::Backend *chosen_backend = nullptr;
 435   for (const auto *backend : _all_backends)
 436   {
 437     std::multimap<int64_t, int64_t> transfer_st_exec_time;
 438     const auto est_and_et = ESTAndExecTime(backend, index, transfer_st_exec_time);
 439
 440     if (eft > est_and_et.first + est_and_et.second)
 441     {
 442       eft = est_and_et.first + est_and_et.second;
 443       selected_exec_time = est_and_et.second;
 444       chosen_backend = backend;
 445       selected_transfer_st_exec_time = transfer_st_exec_time;
 446     }
 447   }
 448
 449   if (chosen_backend == nullptr)
 450   {
 451     throw std::runtime_error{"Fail to choose backend on scheduler"};
 452   }
 453
 454   // this is part of a branch and it is assigned another backend
 455   if (parent_backend && parent_backend != chosen_backend)
 456   {
 457     return false;
 458   }
 459   for (const auto &it : selected_transfer_st_exec_time)
 460   {
 461     auto prev_op_ft = backendAvailableTime(_cpu_backend, it.first, it.second);
 462     _backends_avail_time[_cpu_backend].insert({prev_op_ft + it.second, prev_op_ft});
 463   }
 464
 465   _ops_eft[index] = eft;
 466   _backends_avail_time[chosen_backend].emplace(eft, eft - selected_exec_time);
 467   _backend_resolver->setBackend(index, chosen_backend);
 468
 469   VERBOSE(HEScheduler::schedule) << "backend for " << node.name() << " is "
 470                                  << chosen_backend->config()->id() << ". Its eft: " << eft
 471                                  << std::endl;
 472   return true;
 473 }
 474
 475 std::pair<int64_t, int64_t>
 476 HEScheduler::ESTAndExecTime(const backend::Backend *backend, const ir::OperationIndex &index,
 477                             std::multimap<int64_t, int64_t> &transfer_st_exec_time)
 478 {
 479   // Permutation will cause creating a separate op_seq that contains just this permutation node.
 480   // This isn't needed for Linear executor since it doesn't use op_seqs
 481   // Number 1 ms is picked experimentally
 482   int64_t permute_fine = 1000;
 483   // Multiply cpu operations' exec time by 2 because in parallel executor it might be busy with
 484   // permutation on other branches or non-nnfw specific tasks and have to wait for it.
 485   // Number 2 is picked experimentally
 486   const int64_t CPU_DELAY = 2;
 487   const auto &node = _graph->operations().at(index);
 488   const bool quant = isQuant(*_graph, node);
 489   const auto size = getOperationsFlattenedIOSize(*_graph, node);
 490   // if this node can be part of a op_seq, then assigning different backend will cause creating
 491   // another op_seq
 492   if (isMergeable(*_graph, node))
 493   {
 494     permute_fine *= 2;
 495   }
 496   if (isWorkaroundSkip(*_graph, backend, node, quant))
 497   {
 498     return {_exec_time->getMax(), _exec_time->getMax()};
 499   }
 500   // get average exec time of the op on this backend
 501   auto exec_time = getOpTime(backend, node.name(), quant, size);
 502   if (backend->config()->id() == "cpu" && _is_parallel_exec)
 503   {
 504     exec_time *= CPU_DELAY;
 505   }
 506
 507   // get max eft of direct (one level above) predecessors
 508   auto max_pred_eft = predMaxEFT(backend, node, transfer_st_exec_time);
 509
 510   int64_t total_transfer_cost = 0;
 511   std::vector<std::multimap<int64_t, int64_t>::iterator> inserted_permutations;
 512   // Find free time for data transferring and insert it into backend taskset. This is needed:
 513   //  1. Time for multiple permutations for this node's input is found correctly
 514   //  2. If backend==cpu, then free time for this node must come after permutations
 515   for (auto &&it : transfer_st_exec_time)
 516   {
 517     if (_is_parallel_exec)
 518     {
 519       it.second *= CPU_DELAY;
 520     }
 521     if (!_is_linear_exec)
 522     {
 523       it.second += permute_fine;
 524     }
 525     total_transfer_cost += it.second;
 526
 527     const auto prev_op_ft = backendAvailableTime(_cpu_backend, it.first, it.second);
 528
 529     max_pred_eft = std::max(max_pred_eft, prev_op_ft + it.second);
 530
 531     const auto tmp = _backends_avail_time[_cpu_backend].emplace(prev_op_ft + it.second, prev_op_ft);
 532     inserted_permutations.push_back(tmp.first);
 533   }
 534   // find the hole/gap, where this op can be put or the finishing time of the last assigned op
 535   auto prev_op_ft = backendAvailableTime(backend, max_pred_eft, exec_time);
 536
 537   // Remove inserted permutation from cpu's task set
 538   for (const auto &it : inserted_permutations)
 539   {
 540     _backends_avail_time[_cpu_backend].erase(it);
 541   }
 542
 543   /* In case non-parallel executor measure just exec time and data transfer time
 544    * because EFT(prev_op_ft) is the same for all backends. Since two operations
 545    * can't be run simultaneously, finish of running operation must be waited for.
 546    * When an operation starts, all backends are free. So, they need time just for
 547    * data transfer.*/
 548   if (!_is_parallel_exec)
 549   {
 550     VERBOSE(HEScheduler::ESTAndExecTime)
 551       << "exec_time of (" << index << ") " << node.name() << " quant==" << quant << " on "
 552       << backend->config()->id() << " is " << exec_time
 553       << " microseconds. Data transfer cost: " << total_transfer_cost << std::endl;
 554
 555     return {total_transfer_cost, exec_time};
 556   }
 557   VERBOSE(HEScheduler::ESTAndExecTime)
 558     << "exec_time of (" << index << ") " << node.name() << " quant==" << quant << " on "
 559     << backend->config()->id() << ": " << exec_time
 560     << " microseconds. Backend available time: " << prev_op_ft
 561     << " Parent's max eft: " << max_pred_eft - total_transfer_cost
 562     << " data transfer cost: " << total_transfer_cost << std::endl;
 563
 564   return {prev_op_ft, exec_time};
 565 }
 566
 567 int64_t HEScheduler::predMaxEFT(const backend::Backend *backend, const ir::IOperation &node,
 568                                 std::multimap<int64_t, int64_t> &transfer_st_exec_time)
 569 {
 570   int64_t max_pred_eft = 0;
 571   for (const auto &input_operand_idx : node.getInputs() | ir::Remove::UNDEFINED)
 572   {
 573     const auto &input_operand = _graph->operands().at(input_operand_idx);
 574     const bool quant = input_operand.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM;
 575
 576     auto input_node_idx = input_operand.getDef();
 577     if (input_node_idx.valid())
 578     {
 579       // Data transfer cost from parent's node backend to current node's backend:
 580       auto parent_backend = _backend_resolver->getBackend(input_node_idx);
 581
 582       max_pred_eft = std::max(max_pred_eft, _ops_eft.at(input_node_idx));
 583       if (parent_backend != backend)
 584       {
 585         // Multiply operand size by 2 because size must describe input+output size
 586         int64_t transfer_cost =
 587           getPermuteTime(parent_backend, backend, quant, input_operand.info().total_size() * 2);
 588         transfer_st_exec_time.emplace(_ops_eft.at(input_node_idx), transfer_cost);
 589       }
 590     }
 591   }
 592   return max_pred_eft;
 593 }
 594
 595 } // namespace compiler
 596
 597 } // namespace onert