/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "ExecutorFactory.h"

#include <functional>
#include "exec/ExecutionObservers.h"
#include "exec/LinearExecutor.h"
#include "exec/DataflowExecutor.h"
#include "exec/ParallelExecutor.h"
#include "compiler/BackendManager.h"
#include "compiler/ExecutionBuilder.h"
#include "exec/ExecTime.h"
#include "compiler/Linear.h"
#include "compiler/TensorBuilders.h"
#include "backend/IConstantInitializer.h"
#include "backend/IKernelGenerator.h"
#include "backend/IOptimizer.h"
#include "backend/ITensorRegister.h"
#include "backend/controlflow/Config.h"
#include "backend/controlflow/KernelGenerator.h"
#include "backend/controlflow/UserTensor.h"
#include "backend/controlflow/TensorBuilder.h"
#include <memory>

namespace onert
{
namespace
{

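// Wraps a generated IFunction so that the backend's sync() is called right after
// run(). Kernels are wrapped with this when options.he_profiling_mode is enabled
// (see createLinearExecutor and createDataflowExecutor below).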
class SyncFunction final : public exec::IFunction
{
public:
  virtual ~SyncFunction() = default;
  SyncFunction(std::unique_ptr<exec::IFunction> fn, const std::shared_ptr<backend::IConfig> config)
      : _fn{std::move(fn)}, _config{config}
  {
    assert(_fn);
    assert(_config);
  }

  void run() override
  {
    _fn->run();
    _config->sync();
  }

  void prepare() override { _fn->prepare(); }

private:
  std::unique_ptr<exec::IFunction> _fn;
  std::shared_ptr<backend::IConfig> _config;
};

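// Collects the static and dynamic TensorManagers released by every TensorBuilder
// so that their ownership can be handed over to the executor.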
// TODO Think of a better way to manage TensorManagers
backend::TensorManagerSet createTensorManagerSet(const compiler::TensorBuilders &tensor_builders)
{
  backend::TensorManagerSet tensor_mgrs;
  for (auto &tensor_builder : tensor_builders)
  {
    auto s_tensor_manager = tensor_builder->releaseStaticTensorManager();
    if (s_tensor_manager != nullptr)
      tensor_mgrs.insert(std::move(s_tensor_manager));

    auto d_tensor_manager = tensor_builder->releaseDynamicTensorManager();
    if (d_tensor_manager != nullptr)
      tensor_mgrs.insert(std::move(d_tensor_manager));
  }
  return tensor_mgrs;
}

} // namespace
} // namespace onert

namespace onert
{
namespace compiler
{

ExecutorFactory &ExecutorFactory::get()
{
  static ExecutorFactory singleton;
  return singleton;
}

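// Register a factory function for each supported executor kind. "Dataflow" and
// "Parallel" share createDataflowExecutor; the trailing boolean selects parallel
// execution.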
ExecutorFactory::ExecutorFactory()
{
  _map["Linear"] = createLinearExecutor;
  _map["Dataflow"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
                               std::placeholders::_3, false);
  _map["Parallel"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
                               std::placeholders::_3, true);
}

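// Look up the factory function registered for options.executor and invoke it.
// An unknown executor name makes _map.at() throw std::out_of_range.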
exec::IExecutor *ExecutorFactory::create(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
                                         const compiler::CompilerOptions &options,
                                         const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
  return _map.at(options.executor)(std::move(lowered_graph), options, executor_map);
}

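// Group the operations and operands of the lowered graph by their assigned
// backend and hand each backend its lists via BackendContext::initialize().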
void ExecutorFactory::initializeBackendContext(compiler::LoweredGraph *lowered_graph)
{
  struct Entry
  {
    std::vector<backend::BackendContext::OperationInfo> operation_list;
    std::vector<ir::OperandIndex> operand_list;
  };
  std::unordered_map<const backend::Backend *, Entry> backend_assets;

  // Build lists for operations
  lowered_graph->op_seqs().iterate(
      [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
        auto &op_seq_li = lowered_graph->getLowerInfo()->op_seq;
        auto backend = op_seq_li.at(op_seq_index)->backend();
        for (auto &operation_idx : op_seq.operations())
        {
          backend_assets[backend].operation_list.emplace_back(operation_idx, op_seq.getLayout());
        }
      });

  // Build lists for operands
  lowered_graph->graph().operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
    const auto lower_info = lowered_graph->getLowerInfo(ind);
    for (auto factor : lower_info->def_factors())
    {
      auto backend = factor.backend();
      backend_assets[backend].operand_list.emplace_back(ind);
    }
  });

  for (auto &pair : backend_assets)
  {
    auto backend = pair.first;
    auto &arg = pair.second;
    lowered_graph->backend_contexts().at(backend)->initialize(arg.operation_list, arg.operand_list);
  }
}

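// Register tensor info to each backend's TensorBuilder in the given linear order.
// Backends providing an ITensorRegister do their own registration; otherwise every
// operand used by the op sequence is registered with backend-layout-adjusted info,
// skipping model I/O operands and operands owned by another backend.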
void ExecutorFactory::runTensorRegistration(compiler::LoweredGraph *lowered_graph,
                                            const std::vector<ir::OpSequenceIndex> &order)
{
  for (const auto index : order)
  {
    const auto &op_seq = lowered_graph->op_seqs().at(index);
    const auto backend = lowered_graph->getLowerInfo(index)->backend();
    const auto tensor_register = lowered_graph->backend_contexts().at(backend)->tensor_register;
    auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
    auto model_io = lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs();

    if (tensor_register)
    {
      // Custom registration
      tensor_register->registerTensors(op_seq, lowered_graph->getLowerInfo());
    }
    else
    {
      // Default registration
      for (const auto op_idx : op_seq)
      {
        const auto &op = lowered_graph->graph().operations().at(op_idx);
        for (const auto &index : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs())
        {
          if (!tensor_builder->isRegistered(index) && !model_io.contains(index))
          {
            const auto &operand_lower_info =
                lowered_graph->getLowerInfo(index)->def_factors().getOnlyElement();

            // E.g., permute (CPU) -> tensor A -> MaxPool2D(acl_cl)
            // op.getOutputs() of permute (CPU) returns tensor A,
            // but tensor A belongs to the acl_cl backend.
            // So this tensor must NOT be registered for CPU.
            if (operand_lower_info.backend() != backend)
              continue;

            const auto &obj = lowered_graph->graph().operands().at(index);
            const auto frontend_layout = op_seq.getLayout();
            const auto backend_layout = operand_lower_info.layout();
            ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
                                         obj.typeInfo(), obj.info().memAllocType(),
                                         obj.isConstant()};
            tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
          }
        }
      }
    }
  }
}

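// Create a controlflow UserTensor for each given model input/output operand and
// register it in the controlflow backend's TensorRegistry. The returned tensors
// are passed to the executor as its input/output tensors.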
std::vector<std::shared_ptr<backend::ITensor>>
ExecutorFactory::initializeModelIOTensors(compiler::LoweredGraph &lowered_graph,
                                          const ir::OperandIndexSequence &indices)
{
  std::vector<std::shared_ptr<backend::ITensor>> ret;

  // TODO Store controlflow backend in BackendContext
  std::shared_ptr<backend::controlflow::TensorBuilder> cf_tensor_builder;
  std::shared_ptr<backend::controlflow::TensorRegistry> cf_tensor_reg;
  for (const auto &e : lowered_graph.backend_contexts())
  {
    auto backend = e.first;
    auto &context = e.second;
    if (backend->config()->id() == backend::controlflow::Config::ID)
    {
      cf_tensor_builder =
          std::dynamic_pointer_cast<backend::controlflow::TensorBuilder>(context->tensor_builder);
      cf_tensor_reg =
          std::dynamic_pointer_cast<backend::controlflow::TensorRegistry>(context->tensor_registry);
    }
  }
  assert(cf_tensor_builder);
  assert(cf_tensor_reg);

  for (auto ind : indices)
  {
    const auto &operand = lowered_graph.graph().operands().at(ind);
    auto tensor = std::make_shared<backend::controlflow::UserTensor>(
        operand.info(),
        ir::Layout::NHWC, /* FIXME find op_seq for this operand and use frontend_layout */
        cf_tensor_builder->dynamicTensorManager());

    // Add tensor to controlflow TensorRegistry.
    cf_tensor_reg->setNativeUserTensor(ind, tensor);
    ret.push_back(tensor);
  }
  return ret;
}

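// For every op sequence, find input/output operands whose tensors are owned by
// another backend and, when such a tensor is an IPortableTensor, register it as a
// migrant tensor in the op sequence's backend registry.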
void ExecutorFactory::prepareExternalTensors(compiler::LoweredGraph &lowered_graph)
{
  TensorRegistries tensor_regs{lowered_graph.backend_contexts(), true};

  lowered_graph.op_seqs().iterate(
      [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
        auto lower_info = lowered_graph.getLowerInfo(op_seq_index);
        auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend());
        for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED |
                            ir::Remove::UNDEFINED)
        {
          // If an OpSequence input/output tensor does not have its own tensor object,
          // it must be using an external tensor, so find the tensor in the other tensor
          // registries and set it as a migrant tensor of this backend if it is portable.
          if (!backend_ctx->tensor_registry->getITensor(ind))
          {
            auto tensor = tensor_regs.getITensor(ind);
            assert(tensor); // The tensor must have been registered
            auto ptensor = std::dynamic_pointer_cast<backend::IPortableTensor>(tensor);
            if (ptensor)
              backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor);
          }
        }
      });
}

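// Build a LinearExecutor: run backend optimizers, linearize the op sequences,
// register and plan tensors, generate kernels, allocate memory, and attach a
// ChromeTracingObserver when a trace file path is given.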
exec::IExecutor *
ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
                                      const compiler::CompilerOptions &options,
                                      const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
  const auto &backend_contexts = lowered_graph->backend_contexts();

  initializeBackendContext(lowered_graph.get());

  // Linearization requires a fully built graph
  assert(!lowered_graph->graph().isBuildingPhase());

  /*************************************************
   * Backend dependent analysis & optimization phase
   *************************************************/

  for (auto &pair : backend_contexts)
  {
    auto &optimizer = pair.second->optimizer;
    if (optimizer)
      optimizer->optimize();
  }

  /**********************************************************
   * Backend dependent analysis & optimization phase finished
   **********************************************************/

  /***********************
   * Code generation phase
   ***********************/

  auto order = Linear::linearize(*lowered_graph);
  runTensorRegistration(lowered_graph.get(), order);

  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
  if (options.is_primary_subgraph)
  {
    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
  }

  Linear::dump(*lowered_graph, order);
  Linear::planTensors(*lowered_graph, order);

  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};
  TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true};

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->prepare();
  }

  prepareExternalTensors(*lowered_graph);

  ExecutionBuilder builder;

  // Generate kernels
  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
                                        const ir::OpSequence &op_seq) {
    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
    // Set TensorRegistries and ExecutorMap on the controlflow backend's kernel generator
    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
    if (cf_kernel_gen != nullptr)
    {
      cf_kernel_gen->setTensorRegistries(tensor_regs);
      cf_kernel_gen->setExecutorMap(executor_map);
    }
    auto fn_seq = kernel_gen->generate(op_seq);
    if (options.he_profiling_mode)
    {
      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
    }
    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
  });

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->allocate();
  }

  for (auto &pair : backend_contexts)
  {
    pair.second->initConsts();
  }

  // Frontend operand data is no longer needed after backend constant initialization
  lowered_graph->graph().operands().iterate(
      [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });

  auto code_map = builder.releaseCodeMap();

  for (auto &it : code_map)
  {
    auto op_seq_index = it.first;
    auto &fn_seq = it.second.fn_seq;

    fn_seq->iterate([&](exec::IFunction &ifunc) {
      ifunc.prepare();
      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
      tensor_builder->postFunctionPrepare();
    });
  }

  backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders);
  auto exec = new exec::LinearExecutor{
      std::move(lowered_graph), input_tensors,       output_tensors, tensor_regs,
      std::move(tensor_mgrs),   std::move(code_map), order};

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
        std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

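// Build a DataflowExecutor, or a ParallelExecutor when `parallel` is true. The
// flow mirrors createLinearExecutor, except that operands are only notified of
// their first use (a workaround that keeps tensors allocated for the whole run)
// and, in the non-parallel case, profiling mode attaches a ProfileObserver.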
exec::IExecutor *ExecutorFactory::createDataflowExecutor(
    std::unique_ptr<compiler::LoweredGraph> lowered_graph, const compiler::CompilerOptions &options,
    const std::shared_ptr<exec::ExecutorMap> &executor_map, bool parallel)
{
  const auto &backend_contexts = lowered_graph->backend_contexts();

  initializeBackendContext(lowered_graph.get());

  auto order = Linear::linearize(*lowered_graph);
  runTensorRegistration(lowered_graph.get(), order);

  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
  if (options.is_primary_subgraph)
  {
    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
  }

  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};
  TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true};

  // Workaround for the static memory planner: notify only the first use of every
  // registered operand (and never the last use) so tensors are never deallocated.
  for (auto &tensor_builder : tensor_builders)
  {
    lowered_graph->graph().operands().iterate(
        [&](const ir::OperandIndex &ind, const ir::Operand &) {
          if (tensor_builder->isRegistered(ind))
          {
            tensor_builder->notifyFirstUse(ind);
          }
        });
  }

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->prepare();
  }

  prepareExternalTensors(*lowered_graph);

  ExecutionBuilder builder;

  // Generate kernels
  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
                                        const ir::OpSequence &op_seq) {
    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
    // Set TensorRegistries and ExecutorMap on the controlflow backend's kernel generator
    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
    if (cf_kernel_gen != nullptr)
    {
      cf_kernel_gen->setTensorRegistries(tensor_regs);
      cf_kernel_gen->setExecutorMap(executor_map);
    }
    auto fn_seq = kernel_gen->generate(op_seq);
    if (options.he_profiling_mode)
    {
      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
    }
    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
  });

  for (const auto &tensor_builder : tensor_builders)
  {
    tensor_builder->allocate();
  }

  for (auto &pair : backend_contexts)
  {
    pair.second->initConsts();
  }

  // Frontend operand data is no longer needed after backend constant initialization
  lowered_graph->graph().operands().iterate(
      [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });

  auto code_map = builder.releaseCodeMap();

  for (auto &it : code_map)
  {
    auto op_seq_index = it.first;
    auto &fn_seq = it.second.fn_seq;

    fn_seq->iterate([&](exec::IFunction &ifunc) {
      ifunc.prepare();
      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
      tensor_builder->postFunctionPrepare();
    });
  }

  backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders);

  exec::ExecutorBase *exec = nullptr;
  if (parallel)
  {
    exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors,
                                      output_tensors,           tensor_regs,
                                      std::move(tensor_mgrs),   std::move(code_map)};
  }
  else
  {
    auto dataflow_exec = new exec::DataflowExecutor{std::move(lowered_graph), input_tensors,
                                                    output_tensors,           tensor_regs,
                                                    std::move(tensor_mgrs),   std::move(code_map)};
    if (options.he_profiling_mode)
    {
      std::vector<const backend::Backend *> backends;
      for (const auto &pair : backend_contexts)
      {
        backends.push_back(pair.first);
      }
      auto et = std::make_shared<exec::ExecTime>(backends);
      std::unique_ptr<exec::IExecutionObserver> obs =
          std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
      dataflow_exec->addObserver(std::move(obs));
    }
    exec = dataflow_exec;
  }

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
        std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

} // namespace compiler
} // namespace onert