/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "ExecutorFactory.h"

#include <functional>
#include "exec/ExecutionObservers.h"
#include "exec/LinearExecutor.h"
#include "exec/DataflowExecutor.h"
#include "exec/ParallelExecutor.h"
#include "compiler/BackendManager.h"
#include "compiler/ExecutionBuilder.h"
#include "exec/ExecTime.h"
#include "compiler/Linear.h"
#include "backend/IConstantInitializer.h"
#include "backend/IKernelGenerator.h"
#include "backend/IOptimizer.h"
#include "backend/ITensorRegister.h"
#include "backend/controlflow/Config.h"
#include "backend/controlflow/KernelGenerator.h"
#include "backend/controlflow/UserTensor.h"
#include "backend/controlflow/TensorBuilder.h"
#include <memory>
#include <cassert>

namespace onert
{
namespace
{

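// Wraps a generated kernel function and calls the backend's sync() right after
// it runs. Used when profiling mode (he_profiling_mode) is enabled so that each
// operation completes before timing continues.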
class SyncFunction final : public exec::IFunction
{
public:
  virtual ~SyncFunction() = default;
  SyncFunction(std::unique_ptr<exec::IFunction> fn, const std::shared_ptr<backend::IConfig> config)
      : _fn{std::move(fn)}, _config{config}
  {
    assert(_fn);
    assert(_config);
  }

  void run() override
  {
    _fn->run();
    _config->sync();
  }

  void prepare() override { _fn->prepare(); }

private:
  std::unique_ptr<exec::IFunction> _fn;
  std::shared_ptr<backend::IConfig> _config;
};

} // namespace
} // namespace onert

namespace onert
{
namespace compiler
{

ExecutorFactory &ExecutorFactory::get()
{
  static ExecutorFactory singleton;
  return singleton;
}

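// Register the available executor kinds. "Dataflow" and "Parallel" share one
// factory function and differ only in the `parallel` flag.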
ExecutorFactory::ExecutorFactory()
{
  _map["Linear"] = createLinearExecutor;
  _map["Dataflow"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
                               std::placeholders::_3, false);
  _map["Parallel"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
                               std::placeholders::_3, true);
}

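// Create an executor of the kind requested by the compiler options
// ("Linear", "Dataflow" or "Parallel").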
exec::IExecutor *ExecutorFactory::create(std::unique_ptr<ir::LoweredGraph> lowered_graph,
                                         const compiler::CompilerOptions &options,
                                         const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
  return _map.at(options.executor)(std::move(lowered_graph), options, executor_map);
}

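// Group operations and operands by the backend they were lowered to, then hand
// each backend context its own operation/operand lists via initialize().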
void ExecutorFactory::initializeBackendContext(ir::LoweredGraph *lowered_graph)
{
  struct Entry
  {
    std::vector<backend::BackendContext::OperationInfo> operation_list;
    std::vector<ir::OperandIndex> operand_list;
  };
  std::unordered_map<const backend::Backend *, Entry> backend_assets;

  // Build lists for operations
  lowered_graph->op_seqs().iterate(
      [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
        auto &op_seq_li = lowered_graph->getLowerInfo()->op_seq;
        auto backend = op_seq_li.at(op_seq_index)->backend();
        for (auto &operation_idx : op_seq.operations())
        {
          backend_assets[backend].operation_list.emplace_back(operation_idx, op_seq.getLayout());
        }
      });

  // Build lists for operands
  lowered_graph->graph().operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
    const auto lower_info = lowered_graph->getLowerInfo(ind);
    for (auto factor : lower_info->def_factors())
    {
      auto backend = factor.backend();
      backend_assets[backend].operand_list.emplace_back(ind);
    }
  });

  for (auto &pair : backend_assets)
  {
    auto backend = pair.first;
    auto &arg = pair.second;
    lowered_graph->backend_contexts().at(backend)->initialize(arg.operation_list, arg.operand_list);
  }
}

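// Register tensor info to each backend's tensor builder, following the given
// op sequence order. A backend may provide its own ITensorRegister; otherwise
// the default registration below is used.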
void ExecutorFactory::runTensorRegistration(ir::LoweredGraph *lowered_graph,
                                            const std::vector<ir::OpSequenceIndex> &order)
{
  for (const auto index : order)
  {
    const auto &op_seq = lowered_graph->op_seqs().at(index);
    const auto backend = lowered_graph->getLowerInfo(index)->backend();
    const auto tensor_register = lowered_graph->backend_contexts().at(backend)->tensor_register;
    auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
    if (tensor_register)
    {
      // Custom registration
      tensor_register->registerTensors(op_seq, lowered_graph->getLowerInfo());
    }
    else
    {
      // Default registration
      for (const auto op_idx : op_seq)
      {
        const auto &op = lowered_graph->graph().operations().at(op_idx);
        for (const auto &index : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs())
        {
          if (!tensor_builder->isRegistered(index))
          {
            const auto &operand_lower_info =
                lowered_graph->getLowerInfo(index)->def_factors().getOnlyElement();

            // E.g., permute (CPU) -> tensor A -> MaxPool2D (acl_cl)
            // op.getOutputs() of permute (CPU) returns tensor A,
            // but tensor A belongs to the acl_cl backend.
            // So this tensor must NOT be registered for CPU.
            if (operand_lower_info.backend() != backend)
              continue;

            const auto &obj = lowered_graph->graph().operands().at(index);
            const auto frontend_layout = op_seq.getLayout();
            const auto backend_layout = operand_lower_info.layout();
            ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
                                         obj.typeInfo(), obj.info().memAllocType(),
                                         obj.isConstant()};
            tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
          }
        }
      }
    }
  }
}

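// Create UserTensor objects for the model's input/output operands and register
// them to the controlflow backend, so user-provided buffers can later be bound
// to them.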
std::vector<std::shared_ptr<backend::ITensor>>
ExecutorFactory::initializeModelIOTensors(ir::LoweredGraph &lowered_graph,
                                          const ir::OperandIndexSequence &indices)
{
  std::vector<std::shared_ptr<backend::ITensor>> ret;

  TensorBuilders tensor_builders{lowered_graph.backend_contexts(), false};
  std::shared_ptr<backend::controlflow::TensorBuilder> cf_tensor_builder =
      tensor_builders.getControlflowTensorBuilder();
  assert(cf_tensor_builder);

  for (auto ind : indices)
  {
    const auto &operand = lowered_graph.graph().operands().at(ind);
    auto tensor = std::make_shared<backend::controlflow::UserTensor>(
        operand.info(),
        ir::Layout::NHWC, /* FIXME find op_seq for this operand and use frontend_layout */
        cf_tensor_builder->dynamicTensorManager());

    // Add tensor to controlflow TensorRegistry.
    cf_tensor_builder->setUserTensor(ind, tensor);
    ret.push_back(tensor);
  }
  return ret;
}

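// For every op sequence input/output that is owned by another backend, register
// it as a migrant tensor in this backend's tensor builder (portable tensors only).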
void ExecutorFactory::prepareExternalTensors(ir::LoweredGraph &lowered_graph,
                                             TensorBuilders &tensor_builders)
{
  lowered_graph.op_seqs().iterate(
      [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
        auto lower_info = lowered_graph.getLowerInfo(op_seq_index);
        auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend());
        for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED |
                            ir::Remove::UNDEFINED)
        {
          // If an OpSequence input/output tensor does not have its own tensor object,
          // it must be using an external tensor, so find the tensor in the other tensor
          // builders and set it on this tensor builder if it is portable.
          if (!backend_ctx->tensor_builder->tensorAt(ind))
          {
            auto tensor = tensor_builders.getITensor(ind);
            assert(tensor); // The tensor must have been created in one of the TensorBuilders
            auto ptensor = std::dynamic_pointer_cast<backend::IPortableTensor>(tensor);
            if (ptensor)
              backend_ctx->tensor_builder->setMigrantTensor(ind, ptensor);
          }
        }
      });
}

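// Build a LinearExecutor: run backend optimizers, linearize the graph, register
// and plan tensors, generate kernels, and allocate memory up front.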
exec::IExecutor *
ExecutorFactory::createLinearExecutor(std::unique_ptr<ir::LoweredGraph> lowered_graph,
                                      const compiler::CompilerOptions &options,
                                      const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
  const auto &backend_contexts = lowered_graph->backend_contexts();

  initializeBackendContext(lowered_graph.get());

  // linearize
  assert(!lowered_graph->graph().isBuildingPhase());

  /*************************************************
   * Backend dependent analysis & optimization phase
   *************************************************/

  for (auto &pair : backend_contexts)
  {
    auto &optimizer = pair.second->optimizer;
    if (optimizer)
      optimizer->optimize();
  }

  /**********************************************************
   * Backend dependent analysis & optimization phase finished
   **********************************************************/

  /***********************
   * Code generation phase
   ***********************/

  auto order = Linear::linearize(*lowered_graph);
  runTensorRegistration(lowered_graph.get(), order);

  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
  if (options.is_primary_subgraph)
  {
    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
  }

  Linear::dump(*lowered_graph, order);
  Linear::planTensors(*lowered_graph, order);

  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->prepare();
  }

  prepareExternalTensors(*lowered_graph, tensor_builders);

  ExecutionBuilder builder;

  // Generate kernels
  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
                                        const ir::OpSequence &op_seq) {
    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
    // Pass the TensorBuilderSet and ExecutorMap to the control flow kernel generator
    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
    if (cf_kernel_gen != nullptr)
    {
      cf_kernel_gen->setTensorBuilderSet(tensor_builders);
      cf_kernel_gen->setExecutorMap(executor_map);
    }
    auto fn_seq = kernel_gen->generate(op_seq);
    if (options.he_profiling_mode)
    {
      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
    }
    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
  });

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->allocate();
  }

  for (auto &pair : backend_contexts)
  {
    pair.second->initConsts();
  }

  lowered_graph->graph().operands().iterate(
      [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });

  auto code_map = builder.releaseCodeMap();

  for (auto &it : code_map)
  {
    auto op_seq_index = it.first;
    auto &fn_seq = it.second.fn_seq;

    fn_seq->iterate([&](exec::IFunction &ifunc) {
      ifunc.prepare();
      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
      tensor_builder->postFunctionPrepare();
    });
  }

  auto exec =
      new exec::LinearExecutor{std::move(lowered_graph), input_tensors,       output_tensors,
                               tensor_builders,          std::move(code_map), order};

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
        std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

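// Build a DataflowExecutor or, when `parallel` is true, a ParallelExecutor.
// Unlike the linear path, tensors are kept allocated for the whole execution
// (see the static memory planner workaround below).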
exec::IExecutor *ExecutorFactory::createDataflowExecutor(
    std::unique_ptr<ir::LoweredGraph> lowered_graph, const compiler::CompilerOptions &options,
    const std::shared_ptr<exec::ExecutorMap> &executor_map, bool parallel)
{
  const auto &backend_contexts = lowered_graph->backend_contexts();

  initializeBackendContext(lowered_graph.get());

  auto order = Linear::linearize(*lowered_graph);
  runTensorRegistration(lowered_graph.get(), order);

  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
  if (options.is_primary_subgraph)
  {
    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
  }

  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};

  // Workaround for the static memory planner: mark every registered operand as used
  // so that no tensor is ever deallocated
  for (auto &tensor_builder : tensor_builders)
  {
    lowered_graph->graph().operands().iterate(
        [&](const ir::OperandIndex &ind, const ir::Operand &) {
          if (tensor_builder->isRegistered(ind))
          {
            tensor_builder->notifyFirstUse(ind);
          }
        });
  }

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->prepare();
  }

  prepareExternalTensors(*lowered_graph, tensor_builders);

  ExecutionBuilder builder;

  // Generate kernels
  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
                                        const ir::OpSequence &op_seq) {
    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
    // Pass the TensorBuilderSet and ExecutorMap to the control flow kernel generator
    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
    if (cf_kernel_gen != nullptr)
    {
      cf_kernel_gen->setTensorBuilderSet(tensor_builders);
      cf_kernel_gen->setExecutorMap(executor_map);
    }
    auto fn_seq = kernel_gen->generate(op_seq);
    if (options.he_profiling_mode)
    {
      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
    }
    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
  });

  for (const auto &tensor_builder : tensor_builders)
  {
    tensor_builder->allocate();
  }

  for (auto &pair : backend_contexts)
  {
    pair.second->initConsts();
  }

  lowered_graph->graph().operands().iterate(
      [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });

  auto code_map = builder.releaseCodeMap();

  for (auto &it : code_map)
  {
    auto op_seq_index = it.first;
    auto &fn_seq = it.second.fn_seq;

    fn_seq->iterate([&](exec::IFunction &ifunc) {
      ifunc.prepare();
      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
      tensor_builder->postFunctionPrepare();
    });
  }

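  // Choose the concrete executor; in HE profiling mode the dataflow executor
  // additionally gets a ProfileObserver that records execution times per backend.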
  exec::ExecutorBase *exec = nullptr;
  if (parallel)
  {
    exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors, output_tensors,
                                      tensor_builders, std::move(code_map)};
  }
  else
  {
    auto dataflow_exec =
        new exec::DataflowExecutor{std::move(lowered_graph), input_tensors, output_tensors,
                                   tensor_builders, std::move(code_map)};
    if (options.he_profiling_mode)
    {
      std::vector<const backend::Backend *> backends;
      for (const auto &pair : backend_contexts)
      {
        backends.push_back(pair.first);
      }
      auto et = std::make_shared<exec::ExecTime>(backends);
      std::unique_ptr<exec::IExecutionObserver> obs =
          std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
      dataflow_exec->addObserver(std::move(obs));
    }
    exec = dataflow_exec;
  }

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
        std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

} // namespace compiler
} // namespace onert