/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "ExecutorFactory.h"

#include "Linear.h"
#include "../backend/builtin/BackendContext.h"
#include "../backend/builtin/Config.h"
#include "../backend/builtin/UserTensor.h"
#include "../dumper/text/GraphDumper.h"
#include "../exec/DataflowExecutor.h"
#include "../exec/ExecTime.h"
#include "../exec/ExecutionObservers.h"
#include "../exec/LinearExecutor.h"
#include "../exec/ParallelExecutor.h"
#include "../ir/OperationCloner.h"

#include <backend/IPortableTensor.h>
#include <compiler/BackendManager.h>
#include <compiler/ExecutionBuilder.h>
#include <util/TracingCtx.h>

#include <functional>
#include <memory>

namespace onert
{
namespace
{

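// Wraps a kernel function and calls the backend's sync() after every run. Kernels are
// wrapped with this when he_profiling_mode is enabled (see createLinearExecutor and
// createDataflowExecutor below), so that a run only returns once the work is complete.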
class SyncFunction final : public exec::IFunction
{
public:
  virtual ~SyncFunction() = default;
  SyncFunction(std::unique_ptr<exec::IFunction> fn, const std::shared_ptr<backend::IConfig> config)
    : _fn{std::move(fn)}, _config{config}
  {
    assert(_fn);
    assert(_config);
  }

  void run() override
  {
    _fn->run();
    _config->sync();
  }

  void prepare() override { _fn->prepare(); }

private:
  std::unique_ptr<exec::IFunction> _fn;
  std::shared_ptr<backend::IConfig> _config;
};

using DeallocList = std::vector<backend::ITensor *>;
// Deallocates dynamic tensors after an operation finishes; used by the Linear Executor
class DeallocFunction final : public exec::IFunction
{
public:
  DeallocFunction(const DeallocList &tensors) : _dealloc_list{tensors} {}

  void run() override
  {
    for (auto tensor : _dealloc_list)
    {
      if (!tensor->is_dynamic())
        continue;
      tensor->deallocBuffer();
    }
  }

private:
  DeallocList _dealloc_list;
};

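// Registers an IOTensor in the builtin backend's TensorRegistry for each of the given
// operand indices (the subgraph's inputs and outputs).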
void initializeSubgraphIOTensors(compiler::LoweredGraph &lowered_graph,
                                 const backend::BackendContexts &backend_contexts,
                                 const ir::OperandIndexSequence &indices)
{
  // TODO Store builtin backend in BackendContext
  std::shared_ptr<backend::builtin::TensorRegistry> builtin_tensor_reg;
  for (const auto &e : backend_contexts)
  {
    auto backend = e.first;
    auto &context = e.second;
    if (backend->config()->id() == backend::builtin::Config::ID)
    {
      builtin_tensor_reg =
        std::dynamic_pointer_cast<backend::builtin::TensorRegistry>(context->tensor_registry);
    }
  }
  assert(builtin_tensor_reg);

  for (auto ind : indices)
  {
    const auto &operand = lowered_graph.graph().operands().at(ind);
    auto tensor = std::make_unique<backend::builtin::IOTensor>(
      operand.info(),
      ir::Layout::NHWC /* FIXME find operation for this operand and use frontend_layout */
    );

    // Add tensor to builtin TensorRegistry.
    builtin_tensor_reg->setNativeIOTensor(ind, std::move(tensor));
  }
}

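// Creates a BackendContext for every registered backend. Each context receives a partial
// graph holding only the operands and operations lowered to that backend, along with its
// operand layouts, external operands, and operation order.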
backend::BackendContexts createBackendContexts(compiler::LoweredGraph &lgraph, bool linear_executor)
{
  backend::BackendContexts contexts;
  auto &backend_manager = compiler::BackendManager::get();

  std::unordered_map<const backend::Backend *, backend::ContextData> context_data_map;

  // Generate partial graphs for each backend
  for (auto backend : backend_manager.getAll())
  {
    auto &data = context_data_map[backend];
    auto graph = std::make_unique<ir::Graph>();
    graph->setLayout(lgraph.graph().layout());
    data.graph = std::move(graph);
  }

  auto &whole_graph = lgraph.graph();
  // Separate operands into partial graphs
  whole_graph.operands().iterate([&](const ir::OperandIndex &operand_ind, ir::Operand &operand) {
    auto &operand_li = lgraph.lower_info().operand;
    const auto &def_factors = operand_li.at(operand_ind).def_factors();
    if (def_factors.size() == 0) // Ignore unused tensor
      return;
    const auto &def_factor = def_factors.getOnlyElement();
    const auto backend = def_factor.backend();
    auto &partial_graph = *context_data_map[backend].graph;
    auto &operand_layouts = context_data_map[backend].operand_layouts;
    assert(operand_layouts.find(operand_ind) == operand_layouts.end());
    operand_layouts[operand_ind] = def_factor.layout();

    // Copy the operand and insert it into the partial graph
    auto new_operand = std::make_unique<ir::Operand>(operand);
    new_operand->clearDefUse();
    operand.releaseData(); // Deref data of LoweredGraph
    auto new_operand_ind = partial_graph.addOperand(operand_ind, std::move(new_operand));
    UNUSED_RELEASE(new_operand_ind);
    assert(new_operand_ind == operand_ind);
  });
  // Separate operations into partial graphs
  whole_graph.operations().iterate(
    [&](const ir::OperationIndex &op_ind, const ir::Operation &operation) {
      auto &op_li = lgraph.lower_info().operation;
      auto backend = op_li.at(op_ind).backend();
      auto &partial_graph = *context_data_map[backend].graph;
      auto &external_operands = context_data_map[backend].external_operands;
      auto &operand_layouts = context_data_map[backend].operand_layouts;

      {
        // Add missing operands (externals)
        auto io_list = (operation.getInputs() + operation.getOutputs()) | ir::Remove::DUPLICATED |
                       ir::Remove::UNDEFINED;
        for (auto operand_ind : io_list)
        {
          if (partial_graph.operands().exist(operand_ind))
            continue;

          // Copy the operand and insert it into the partial graph
          const auto &operand = whole_graph.operands().at(operand_ind);
          auto new_operand = std::make_unique<ir::Operand>(operand);
          new_operand->clearDefUse();
          auto new_operand_ind = partial_graph.addOperand(operand_ind, std::move(new_operand));
          UNUSED_RELEASE(new_operand_ind);
          assert(new_operand_ind == operand_ind);

          auto layout =
            lgraph.lower_info().operand.at(operand_ind).def_factors().getOnlyElement().layout();
          assert(operand_layouts.find(operand_ind) == operand_layouts.end());
          operand_layouts[operand_ind] = layout;
          external_operands.add(operand_ind);
        }

        auto new_op_ind = partial_graph.addOperation(op_ind, clone(operation));
        UNUSED_RELEASE(new_op_ind);
        assert(new_op_ind == op_ind);
      }
    });

  // Create contexts
  auto whole_op_order = lgraph.graph().topolSortOperations();
  for (auto &&pair : context_data_map)
  {
    auto backend = pair.first;
    auto &data = pair.second;
    // Handle graph inputs/outputs or external tensors
    data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &operand) {
      if (whole_graph.getInputs().contains(ind) || whole_graph.getOutputs().contains(ind))
        data.external_operands.add(ind);
      // Inputs are either "graph input" or "no def op and non-constant"
      if (whole_graph.getInputs().contains(ind) ||
          (!operand.getDef().valid() && !operand.isConstant()))
        data.graph->addInput(ind);
      // Outputs are either "graph output" or "no uses"
      if (whole_graph.getOutputs().contains(ind) || operand.getUses().size() == 0)
        data.graph->addOutput(ind);
    });
    dumper::text::dumpGraph(*data.graph);

    std::copy_if(whole_op_order.begin(), whole_op_order.end(), std::back_inserter(data.op_order),
                 [&](const auto &ind) { return data.graph->operations().exist(ind); });
    data.is_linear_executor = linear_executor;
    data.custom_kernel_builder = lgraph.graph().getKernelBuilder();
    contexts.emplace(backend, backend->newContext(std::move(data)));
  }
  return contexts;
}

} // namespace
} // namespace onert

namespace onert
{
namespace compiler
{

ExecutorFactory &ExecutorFactory::get()
{
  static ExecutorFactory singleton;
  return singleton;
}

ExecutorFactory::ExecutorFactory()
{
  _map["Linear"] = createLinearExecutor;
  _map["Dataflow"] =
    std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
              std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, false);
  _map["Parallel"] =
    std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
              std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, true);
}

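// Dispatches to the builder registered for options.executor
// ("Linear", "Dataflow" or "Parallel").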
exec::IExecutor *ExecutorFactory::create(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
                                         const util::TracingCtx *tracing_ctx,
                                         const compiler::CompilerOptions &options,
                                         const std::shared_ptr<exec::IExecutors> &executors,
                                         const ir::ModelIndex &index)
{
  return _map.at(options.executor)(std::move(lowered_graph), tracing_ctx, options, executors,
                                   index);
}

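// Makes every operation's input/output tensors visible to its backend by registering
// tensors owned by other backends as migrant tensors.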
void ExecutorFactory::prepareMigrantTensors(compiler::LoweredGraph &lowered_graph,
                                            const backend::BackendContexts &backend_contexts)
{
  TensorRegistries tensor_regs{backend_contexts, true};

  lowered_graph.graph().operations().iterate(
    [&](const ir::OperationIndex &op_ind, const ir::Operation &op) {
      auto lower_info = lowered_graph.lower_info().operation.getRawPtr(op_ind);
      auto &backend_ctx = backend_contexts.at(lower_info->backend());
      for (auto ind :
           (op.getInputs() + op.getOutputs()) | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
      {
        // If an operation's input/output tensor has no tensor object of its own,
        // it must be using a migrant tensor, so find the tensor in the other tensor
        // registries and register it with the current registry if it is portable
        if (!backend_ctx->tensor_registry->getITensor(ind))
        {
          auto tensor = tensor_regs.getITensor(ind);
          assert(tensor); // The tensor must have been registered
          auto ptensor = dynamic_cast<backend::IPortableTensor *>(tensor);
          if (ptensor)
            backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor);
        }
      }
    });
}

void ExecutorFactory::prepareBuiltinBackend(const TensorRegistries &tensor_regs,
                                            const std::shared_ptr<exec::IExecutors> &executors,
                                            const backend::BackendContexts &backend_contexts,
                                            const ir::ModelIndex &index)
{
  for (auto &&pair : backend_contexts)
  {
    auto builtin_context = dynamic_cast<backend::builtin::BackendContext *>(pair.second.get());
    if (builtin_context != nullptr)
    {
      auto builtin_kernel_gen = builtin_context->kernel_gen;
      builtin_kernel_gen->setTensorRegistries(tensor_regs);
      builtin_kernel_gen->setExecutors(executors);
      builtin_kernel_gen->setModelIndex(index);
    }
  }
}

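// Returns the backend contexts in the order kernel generation should visit them:
// non-builtin backends first, the builtin backend last (see the note inside).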
std::deque<std::pair<const backend::Backend *, backend::BackendContext *>>
ExecutorFactory::orderBackendContext(const backend::BackendContexts &backend_contexts)
{
  std::deque<std::pair<const backend::Backend *, backend::BackendContext *>> ordered_contexts;

  for (auto &&pair : backend_contexts)
  {
    // NOTE The builtin backend must be processed last. This is because the Permute
    // layer is the only operation that may have different ITensor objects for its
    // input and output, and it requires all other backends' tensors to be ready.
    if (pair.first->config()->id() == "builtin")
      ordered_contexts.emplace_back(pair.first, pair.second.get());
    else
      ordered_contexts.emplace_front(pair.first, pair.second.get());
  }

  return ordered_contexts;
}

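// Creates a LinearExecutor: builds per-backend contexts, linearizes the operation order,
// generates tensors and kernels, and plans deallocation of dynamic tensors at each
// operand's last use.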
exec::IExecutor *ExecutorFactory::createLinearExecutor(
  std::unique_ptr<compiler::LoweredGraph> lowered_graph, const util::TracingCtx *tracing_ctx,
  const compiler::CompilerOptions &options, const std::shared_ptr<exec::IExecutors> &executors,
  const ir::ModelIndex &index)
{
  auto &graph = lowered_graph->graph();

  backend::BackendContexts backend_contexts =
    createBackendContexts(*lowered_graph, options.executor == "Linear");

  TensorRegistries tensor_regs{backend_contexts, true};

  initializeSubgraphIOTensors(
    *lowered_graph, backend_contexts,
    (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) |
      ir::Remove::DUPLICATED | ir::Remove::UNDEFINED);

  // Linearize
  auto order = Linear::linearize(*lowered_graph);
  Linear::dump(*lowered_graph, order);

  for (auto &&pair : backend_contexts)
  {
    pair.second->genTensors();
  }

  prepareMigrantTensors(*lowered_graph, backend_contexts);

  // Give some runtime objects to builtin KernelGenerator
  prepareBuiltinBackend(tensor_regs, executors, backend_contexts, index);

  ExecutionBuilder builder;

  // Adjust the order of backends for the upcoming iteration
  auto ordered_contexts = orderBackendContext(backend_contexts);

  // Simulate the execution in linearized order to find the last use of each operand,
  // so the corresponding tensor can be deallocated right after that operation
  std::unordered_map<ir::OperationIndex, DeallocList> dealloc_list_map;
  {
    ir::OperandIndexMap<uint32_t> uses_map;
    ir::OperandIndexSequence constants;

    auto model_io =
      (graph.getInputs() + graph.getOutputs()) | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;

    // Prepare scanning
    graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
      uses_map[ind] = obj.getUses().size();

      if (obj.isConstant())
        constants.append(ind);
    });

    // A trick to treat constants as an exception
    for (const auto &ind : constants)
    {
      uses_map[ind]++;
    }

    for (const auto op_ind : order)
    {
      const auto &op = graph.operations().at(op_ind);
      auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
      auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;

      for (const auto &ind : op_inputs)
      {
        const auto &operand = graph.operands().at(ind);
        assert(uses_map.find(ind) != uses_map.end());
        assert(uses_map[ind] > 0);
        uses_map[ind]--;
        if (uses_map[ind] == 0 && !operand.info().isVariable() && !model_io.contains(ind))
        {
          dealloc_list_map[op_ind].emplace_back(tensor_regs.getITensor(ind));
        }
      }
    }

    // Dispose and validate
    for (const auto &ind : constants)
    {
      --uses_map[ind];
    }

    assert(
      std::all_of(uses_map.begin(), uses_map.end(),
                  [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
  }

  // Generate kernels
  for (auto &&pair : ordered_contexts)
  {
    auto codes = pair.second->genKernels();
    for (auto &&pair : codes)
    {
      auto &op_ind = pair.first;
      auto &fn_seq = pair.second;
      auto &op = lowered_graph->graph().operations().at(op_ind);
      auto lower_info = lowered_graph->lower_info().operation.getRawPtr(op_ind);
      if (options.he_profiling_mode)
        fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
      if (!dealloc_list_map[op_ind].empty())
        fn_seq->append(std::make_unique<DeallocFunction>(dealloc_list_map[op_ind]));
      builder.append(op_ind, {op_ind, &op, lower_info, std::move(fn_seq)});
    }
  }

  auto code_map = builder.releaseCodeMap();

  auto exec = new exec::LinearExecutor{std::move(lowered_graph),
                                       std::move(backend_contexts),
                                       tensor_regs,
                                       std::move(code_map),
                                       order,
                                       tracing_ctx};

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
      std::make_unique<exec::TracingObserver>(options.trace_filepath, exec->graph(), tracing_ctx);
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

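// Creates a DataflowExecutor, or a ParallelExecutor when `parallel` is true: builds
// per-backend contexts, generates tensors and kernels, and attaches profiling/tracing
// observers according to the compiler options.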
exec::IExecutor *ExecutorFactory::createDataflowExecutor(
  std::unique_ptr<compiler::LoweredGraph> lowered_graph, const util::TracingCtx *tracing_ctx,
  const compiler::CompilerOptions &options, const std::shared_ptr<exec::IExecutors> &executors,
  const ir::ModelIndex &index, bool parallel)
{
  backend::BackendContexts backend_contexts =
    createBackendContexts(*lowered_graph, options.executor == "Linear");

  TensorRegistries tensor_regs{backend_contexts, true};

  initializeSubgraphIOTensors(
    *lowered_graph, backend_contexts,
    (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) |
      ir::Remove::DUPLICATED | ir::Remove::UNDEFINED);

  for (auto &&pair : backend_contexts)
  {
    pair.second->genTensors();
  }

  prepareMigrantTensors(*lowered_graph, backend_contexts);

  // Give some runtime objects to builtin KernelGenerator
  prepareBuiltinBackend(tensor_regs, executors, backend_contexts, index);

  ExecutionBuilder builder;

  // Adjust the order of backends for the upcoming iteration
  auto ordered_contexts = orderBackendContext(backend_contexts);

  // Generate kernels
  for (auto &&pair : ordered_contexts)
  {
    auto codes = pair.second->genKernels();
    for (auto &&pair : codes)
    {
      auto &op_ind = pair.first;
      auto &fn_seq = pair.second;
      auto &op = lowered_graph->graph().operations().at(op_ind);
      auto lower_info = lowered_graph->lower_info().operation.getRawPtr(op_ind);
      if (options.he_profiling_mode)
        fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
      builder.append(op_ind, {op_ind, &op, lower_info, std::move(fn_seq)});
    }
  }

  auto code_map = builder.releaseCodeMap();

  exec::ExecutorBase *exec = nullptr;
  if (parallel)
  {
    exec = new exec::ParallelExecutor{std::move(lowered_graph), std::move(backend_contexts),
                                      tensor_regs, std::move(code_map), tracing_ctx};
  }
  else
  {
    auto dataflow_exec =
      new exec::DataflowExecutor{std::move(lowered_graph), std::move(backend_contexts), tensor_regs,
                                 std::move(code_map), tracing_ctx};
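    // In profiling mode, attach a ProfileObserver backed by exec::ExecTime for the
    // registered backends so per-operation execution can be profiled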
    if (options.he_profiling_mode)
    {
      std::vector<const backend::Backend *> backends;
      for (const auto &pair : backend_contexts)
      {
        backends.push_back(pair.first);
      }
      auto et = std::make_shared<exec::ExecTime>(backends);
      std::unique_ptr<exec::IExecutionObserver> obs =
        std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
      dataflow_exec->addObserver(std::move(obs));
    }
    exec = dataflow_exec;
  }

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
      std::make_unique<exec::TracingObserver>(options.trace_filepath, exec->graph(), tracing_ctx);
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

} // namespace compiler
} // namespace onert