/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "ExecutorFactory.h"

#include <functional>
#include "exec/ExecutionObservers.h"
#include "exec/LinearExecutor.h"
#include "exec/DataflowExecutor.h"
#include "exec/ParallelExecutor.h"
#include "compiler/BackendManager.h"
#include "compiler/ExecutionBuilder.h"
#include "exec/ExecTime.h"
#include "compiler/Linear.h"
#include "compiler/TensorBuilders.h"
#include "backend/IConstantInitializer.h"
#include "backend/IKernelGenerator.h"
#include "backend/IOptimizer.h"
#include "backend/ITensorRegister.h"
#include "backend/controlflow/Config.h"
#include "backend/controlflow/KernelGenerator.h"
#include "backend/controlflow/UserTensor.h"
#include "backend/controlflow/TensorBuilder.h"
#include <memory>

namespace onert
{
namespace
{

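// Wraps a generated IFunction so that the backend's sync() is called right after
// run(). Kernels are wrapped with this when options.he_profiling_mode is enabled
// (see createLinearExecutor and createDataflowExecutor below).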
class SyncFunction final : public exec::IFunction
{
public:
  virtual ~SyncFunction() = default;
  SyncFunction(std::unique_ptr<exec::IFunction> fn, const std::shared_ptr<backend::IConfig> config)
      : _fn{std::move(fn)}, _config{config}
  {
    assert(_fn);
    assert(_config);
  }

  void run() override
  {
    _fn->run();
    _config->sync();
  }

  void prepare() override { _fn->prepare(); }

private:
  std::unique_ptr<exec::IFunction> _fn;
  std::shared_ptr<backend::IConfig> _config;
};

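// Collects the static and dynamic TensorManagers released by every TensorBuilder
// so that their ownership can be handed over to the executor.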
// TODO Think of a better way to manage TensorManagers
backend::TensorManagerSet createTensorManagerSet(const compiler::TensorBuilders &tensor_builders)
{
  backend::TensorManagerSet tensor_mgrs;
  for (auto &tensor_builder : tensor_builders)
  {
    auto s_tensor_manager = tensor_builder->releaseStaticTensorManager();
    if (s_tensor_manager != nullptr)
      tensor_mgrs.insert(std::move(s_tensor_manager));

    auto d_tensor_manager = tensor_builder->releaseDynamicTensorManager();
    if (d_tensor_manager != nullptr)
      tensor_mgrs.insert(std::move(d_tensor_manager));
  }
  return tensor_mgrs;
}

} // namespace
} // namespace onert

namespace onert
{
namespace compiler
{

ExecutorFactory &ExecutorFactory::get()
{
  static ExecutorFactory singleton;
  return singleton;
}

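// Register a factory function for each supported executor kind. "Dataflow" and
// "Parallel" share createDataflowExecutor; the trailing boolean selects parallel
// execution.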
ExecutorFactory::ExecutorFactory()
{
  _map["Linear"] = createLinearExecutor;
  _map["Dataflow"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
                               std::placeholders::_3, false);
  _map["Parallel"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
                               std::placeholders::_3, true);
}

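// Look up the factory function registered for options.executor and invoke it.
// An unknown executor name makes _map.at() throw std::out_of_range.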
exec::IExecutor *ExecutorFactory::create(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
                                         const compiler::CompilerOptions &options,
                                         const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
  return _map.at(options.executor)(std::move(lowered_graph), options, executor_map);
}

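// Group the operations and operands of the lowered graph by their assigned
// backend and hand each backend its lists via BackendContext::initialize().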
void ExecutorFactory::initializeBackendContext(compiler::LoweredGraph *lowered_graph)
{
  struct Entry
  {
    std::vector<backend::BackendContext::OperationInfo> operation_list;
    std::vector<ir::OperandIndex> operand_list;
  };
  std::unordered_map<const backend::Backend *, Entry> backend_assets;

  // Build lists for operations
  lowered_graph->op_seqs().iterate(
      [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
        auto &op_seq_li = lowered_graph->getLowerInfo()->op_seq;
        auto backend = op_seq_li.at(op_seq_index)->backend();
        for (auto &operation_idx : op_seq.operations())
        {
          backend_assets[backend].operation_list.emplace_back(operation_idx, op_seq.getLayout());
        }
      });

  // Build lists for operands
  lowered_graph->graph().operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
    const auto lower_info = lowered_graph->getLowerInfo(ind);
    for (auto factor : lower_info->def_factors())
    {
      auto backend = factor.backend();
      backend_assets[backend].operand_list.emplace_back(ind);
    }
  });

  for (auto &pair : backend_assets)
  {
    auto backend = pair.first;
    auto &arg = pair.second;
    lowered_graph->backend_contexts().at(backend)->initialize(arg.operation_list, arg.operand_list);
  }
}

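// Register tensor info to each backend's TensorBuilder in the given linear order.
// Backends providing an ITensorRegister do their own registration; otherwise every
// operand used by the op sequence is registered with backend-layout-adjusted info,
// skipping model I/O operands and operands owned by another backend.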
void ExecutorFactory::runTensorRegistration(compiler::LoweredGraph *lowered_graph,
                                            const std::vector<ir::OpSequenceIndex> &order)
{
  for (const auto index : order)
  {
    const auto &op_seq = lowered_graph->op_seqs().at(index);
    const auto backend = lowered_graph->getLowerInfo(index)->backend();
    const auto tensor_register = lowered_graph->backend_contexts().at(backend)->tensor_register;
    auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
    auto model_io = lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs();

    if (tensor_register)
    {
      // Custom registration
      tensor_register->registerTensors(op_seq, lowered_graph->getLowerInfo());
    }
    else
    {
      // Default registration
      for (const auto op_idx : op_seq)
      {
        const auto &op = lowered_graph->graph().operations().at(op_idx);
        for (const auto &index : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs())
        {
          if (!tensor_builder->isRegistered(index) && !model_io.contains(index))
          {
            const auto &operand_lower_info =
                lowered_graph->getLowerInfo(index)->def_factors().getOnlyElement();

            // E.g., permute (CPU) -> tensor A -> MaxPool2D(acl_cl)
            // op.getOutputs() of permute (CPU) returns tensor A,
            // but tensor A belongs to the acl_cl backend.
            // So this tensor must NOT be registered for CPU.
            if (operand_lower_info.backend() != backend)
              continue;

            const auto &obj = lowered_graph->graph().operands().at(index);
            const auto frontend_layout = op_seq.getLayout();
            const auto backend_layout = operand_lower_info.layout();
            ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
                                         obj.typeInfo(), obj.info().memAllocType(),
                                         obj.isConstant()};
            tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
          }
        }
      }
    }
  }
}

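// Create a controlflow UserTensor for each given model input/output operand and
// register it in the controlflow backend's TensorRegistry. The returned tensors
// are passed to the executor as its input/output tensors.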
std::vector<std::shared_ptr<backend::ITensor>>
ExecutorFactory::initializeModelIOTensors(compiler::LoweredGraph &lowered_graph,
                                          const ir::OperandIndexSequence &indices)
{
  std::vector<std::shared_ptr<backend::ITensor>> ret;

  // TODO Store controlflow backend in BackendContext
  std::shared_ptr<backend::controlflow::TensorBuilder> cf_tensor_builder;
  std::shared_ptr<backend::controlflow::TensorRegistry> cf_tensor_reg;
  for (const auto &e : lowered_graph.backend_contexts())
  {
    auto backend = e.first;
    auto &context = e.second;
    if (backend->config()->id() == backend::controlflow::Config::ID)
    {
      cf_tensor_builder =
          std::dynamic_pointer_cast<backend::controlflow::TensorBuilder>(context->tensor_builder);
      cf_tensor_reg =
          std::dynamic_pointer_cast<backend::controlflow::TensorRegistry>(context->tensor_registry);
    }
  }
  assert(cf_tensor_builder);
  assert(cf_tensor_reg);

  for (auto ind : indices)
  {
    const auto &operand = lowered_graph.graph().operands().at(ind);
    auto tensor = std::make_shared<backend::controlflow::UserTensor>(
        operand.info(),
        ir::Layout::NHWC, /* FIXME find op_seq for this operand and use frontend_layout */
        cf_tensor_builder->dynamicTensorManager());

    // Add tensor to controlflow TensorRegistry.
    cf_tensor_reg->setNativeUserTensor(ind, tensor);
    ret.push_back(tensor);
  }
  return ret;
}

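// For every op sequence, find input/output operands whose tensors are owned by
// another backend and, when such a tensor is an IPortableTensor, register it as a
// migrant tensor in the op sequence's backend registry.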
void ExecutorFactory::prepareExternalTensors(compiler::LoweredGraph &lowered_graph)
{
  TensorRegistries tensor_regs{lowered_graph.backend_contexts(), true};

  lowered_graph.op_seqs().iterate(
      [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
        auto lower_info = lowered_graph.getLowerInfo(op_seq_index);
        auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend());
        for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED |
                            ir::Remove::UNDEFINED)
        {
          // If an OpSequence input/output tensor does not have its own tensor object,
          // it must be using an external tensor, so find the tensor in the other tensor
          // registries and set it as a migrant tensor of this backend if it is portable.
          if (!backend_ctx->tensor_registry->getITensor(ind))
          {
            auto tensor = tensor_regs.getITensor(ind);
            assert(tensor); // The tensor must have been registered
            auto ptensor = std::dynamic_pointer_cast<backend::IPortableTensor>(tensor);
            if (ptensor)
              backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor);
          }
        }
      });
}

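// Build a LinearExecutor: run backend optimizers, linearize the op sequences,
// register and plan tensors, generate kernels, allocate memory, and attach a
// ChromeTracingObserver when a trace file path is given.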
exec::IExecutor *
ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
                                      const compiler::CompilerOptions &options,
                                      const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
  const auto &backend_contexts = lowered_graph->backend_contexts();

  initializeBackendContext(lowered_graph.get());

  // Linearization requires a fully built graph
  assert(!lowered_graph->graph().isBuildingPhase());

  /*************************************************
   * Backend dependent analysis & optimization phase
   *************************************************/

  for (auto &pair : backend_contexts)
  {
    auto &optimizer = pair.second->optimizer;
    if (optimizer)
      optimizer->optimize();
  }

  /**********************************************************
   * Backend dependent analysis & optimization phase finished
   **********************************************************/

  /***********************
   * Code generation phase
   ***********************/

  auto order = Linear::linearize(*lowered_graph);
  runTensorRegistration(lowered_graph.get(), order);

  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
  if (options.is_primary_subgraph)
  {
    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
  }

  Linear::dump(*lowered_graph, order);
  Linear::planTensors(*lowered_graph, order);

  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};
  TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true};

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->prepare();
  }

  prepareExternalTensors(*lowered_graph);

  ExecutionBuilder builder;

  // Generate kernels
  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
                                        const ir::OpSequence &op_seq) {
    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
    // Set TensorRegistries and ExecutorMap on the controlflow backend's kernel generator
    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
    if (cf_kernel_gen != nullptr)
    {
      cf_kernel_gen->setTensorRegistries(tensor_regs);
      cf_kernel_gen->setExecutorMap(executor_map);
    }
    auto fn_seq = kernel_gen->generate(op_seq);
    if (options.he_profiling_mode)
    {
      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
    }
    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
  });

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->allocate();
  }

  for (auto &pair : backend_contexts)
  {
    pair.second->initConsts();
  }

  // Frontend operand data is no longer needed after backend constant initialization
  lowered_graph->graph().operands().iterate(
      [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });

  auto code_map = builder.releaseCodeMap();

  for (auto &it : code_map)
  {
    auto op_seq_index = it.first;
    auto &fn_seq = it.second.fn_seq;

    fn_seq->iterate([&](exec::IFunction &ifunc) {
      ifunc.prepare();
      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
      tensor_builder->postFunctionPrepare();
    });
  }

  backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders);
  auto exec = new exec::LinearExecutor{
      std::move(lowered_graph), input_tensors,       output_tensors, tensor_regs,
      std::move(tensor_mgrs),   std::move(code_map), order};

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
        std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

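// Build a DataflowExecutor, or a ParallelExecutor when `parallel` is true. The
// flow mirrors createLinearExecutor, except that operands are only notified of
// their first use (a workaround that keeps tensors allocated for the whole run)
// and, in the non-parallel case, profiling mode attaches a ProfileObserver.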
exec::IExecutor *ExecutorFactory::createDataflowExecutor(
    std::unique_ptr<compiler::LoweredGraph> lowered_graph, const compiler::CompilerOptions &options,
    const std::shared_ptr<exec::ExecutorMap> &executor_map, bool parallel)
{
  const auto &backend_contexts = lowered_graph->backend_contexts();

  initializeBackendContext(lowered_graph.get());

  auto order = Linear::linearize(*lowered_graph);
  runTensorRegistration(lowered_graph.get(), order);

  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
  if (options.is_primary_subgraph)
  {
    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
  }

  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};
  TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true};

  // Workaround for the static memory planner: notify only the first use of every
  // registered operand (and never the last use) so tensors are never deallocated.
  for (auto &tensor_builder : tensor_builders)
  {
    lowered_graph->graph().operands().iterate(
        [&](const ir::OperandIndex &ind, const ir::Operand &) {
          if (tensor_builder->isRegistered(ind))
          {
            tensor_builder->notifyFirstUse(ind);
          }
        });
  }

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->prepare();
  }

  prepareExternalTensors(*lowered_graph);

  ExecutionBuilder builder;

  // Generate kernels
  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
                                        const ir::OpSequence &op_seq) {
    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
    // Set TensorRegistries and ExecutorMap on the controlflow backend's kernel generator
    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
    if (cf_kernel_gen != nullptr)
    {
      cf_kernel_gen->setTensorRegistries(tensor_regs);
      cf_kernel_gen->setExecutorMap(executor_map);
    }
    auto fn_seq = kernel_gen->generate(op_seq);
    if (options.he_profiling_mode)
    {
      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
    }
    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
  });

  for (const auto &tensor_builder : tensor_builders)
  {
    tensor_builder->allocate();
  }

  for (auto &pair : backend_contexts)
  {
    pair.second->initConsts();
  }

  // Frontend operand data is no longer needed after backend constant initialization
  lowered_graph->graph().operands().iterate(
      [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });

  auto code_map = builder.releaseCodeMap();

  for (auto &it : code_map)
  {
    auto op_seq_index = it.first;
    auto &fn_seq = it.second.fn_seq;

    fn_seq->iterate([&](exec::IFunction &ifunc) {
      ifunc.prepare();
      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
      tensor_builder->postFunctionPrepare();
    });
  }

  backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders);

  exec::ExecutorBase *exec = nullptr;
  if (parallel)
  {
    exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors,
                                      output_tensors,           tensor_regs,
                                      std::move(tensor_mgrs),   std::move(code_map)};
  }
  else
  {
    auto dataflow_exec = new exec::DataflowExecutor{std::move(lowered_graph), input_tensors,
                                                    output_tensors,           tensor_regs,
                                                    std::move(tensor_mgrs),   std::move(code_map)};
    if (options.he_profiling_mode)
    {
      std::vector<const backend::Backend *> backends;
      for (const auto &pair : backend_contexts)
      {
        backends.push_back(pair.first);
      }
      auto et = std::make_shared<exec::ExecTime>(backends);
      std::unique_ptr<exec::IExecutionObserver> obs =
          std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
      dataflow_exec->addObserver(std::move(obs));
    }
    exec = dataflow_exec;
  }

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
        std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

} // namespace compiler
} // namespace onert