/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "ExecutorFactory.h"

#include "Linear.h"
#include "../backend/builtin/BackendContext.h"
#include "../backend/builtin/Config.h"
#include "../backend/builtin/UserTensor.h"
#include "../dumper/text/GraphDumper.h"
#include "../exec/DataflowExecutor.h"
#include "../exec/ExecTime.h"
#include "../exec/ExecutionObservers.h"
#include "../exec/LinearExecutor.h"
#include "../exec/ParallelExecutor.h"
#include "../ir/OperationCloner.h"

#include <backend/IPortableTensor.h>
#include <compiler/BackendManager.h>
#include <compiler/ExecutionBuilder.h>
#include <util/TracingCtx.h>

#include <functional>
#include <memory>

namespace onert
{
namespace
{

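// Wraps a kernel function and calls the backend's sync() after every run. Kernels are
// wrapped with this when he_profiling_mode is enabled (see createLinearExecutor and
// createDataflowExecutor below), so that a run only returns once the work is complete.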
class SyncFunction final : public exec::IFunction
{
public:
  virtual ~SyncFunction() = default;
  SyncFunction(std::unique_ptr<exec::IFunction> fn, const std::shared_ptr<backend::IConfig> config)
    : _fn{std::move(fn)}, _config{config}
  {
    assert(_fn);
    assert(_config);
  }

  void run() override
  {
    _fn->run();
    _config->sync();
  }

  void prepare() override { _fn->prepare(); }

private:
  std::unique_ptr<exec::IFunction> _fn;
  std::shared_ptr<backend::IConfig> _config;
};

using DeallocList = std::vector<backend::ITensor *>;
// Deallocates dynamic tensors after an operation finishes; used by the Linear Executor
class DeallocFunction final : public exec::IFunction
{
public:
  DeallocFunction(const DeallocList &tensors) : _dealloc_list{tensors} {}

  void run() override
  {
    for (auto tensor : _dealloc_list)
    {
      if (!tensor->is_dynamic())
        continue;
      tensor->deallocBuffer();
    }
  }

private:
  DeallocList _dealloc_list;
};

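// Registers an IOTensor in the builtin backend's TensorRegistry for each of the given
// operand indices (the subgraph's inputs and outputs).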
void initializeSubgraphIOTensors(compiler::LoweredGraph &lowered_graph,
                                 const backend::BackendContexts &backend_contexts,
                                 const ir::OperandIndexSequence &indices)
{
  // TODO Store builtin backend in BackendContext
  std::shared_ptr<backend::builtin::TensorRegistry> builtin_tensor_reg;
  for (const auto &e : backend_contexts)
  {
    auto backend = e.first;
    auto &context = e.second;
    if (backend->config()->id() == backend::builtin::Config::ID)
    {
      builtin_tensor_reg =
        std::dynamic_pointer_cast<backend::builtin::TensorRegistry>(context->tensor_registry);
    }
  }
  assert(builtin_tensor_reg);

  for (auto ind : indices)
  {
    const auto &operand = lowered_graph.graph().operands().at(ind);
    auto tensor = std::make_unique<backend::builtin::IOTensor>(
      operand.info(),
      ir::Layout::NHWC /* FIXME find operation for this operand and use frontend_layout */
    );

    // Add tensor to builtin TensorRegistry.
    builtin_tensor_reg->setNativeIOTensor(ind, std::move(tensor));
  }
}

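// Creates a BackendContext for every registered backend. Each context receives a partial
// graph holding only the operands and operations lowered to that backend, along with its
// operand layouts, external operands, and operation order.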
backend::BackendContexts createBackendContexts(compiler::LoweredGraph &lgraph, bool linear_executor)
{
  backend::BackendContexts contexts;
  auto &backend_manager = compiler::BackendManager::get();

  std::unordered_map<const backend::Backend *, backend::ContextData> context_data_map;

  // Generate partial graphs for each backend
  for (auto backend : backend_manager.getAll())
  {
    auto &data = context_data_map[backend];
    auto graph = std::make_unique<ir::Graph>();
    graph->setLayout(lgraph.graph().layout());
    data.graph = std::move(graph);
  }

  auto &whole_graph = lgraph.graph();
  // Separate operands into partial graphs
  whole_graph.operands().iterate([&](const ir::OperandIndex &operand_ind, ir::Operand &operand) {
    auto &operand_li = lgraph.lower_info().operand;
    const auto &def_factors = operand_li.at(operand_ind).def_factors();
    if (def_factors.size() == 0) // Ignore unused tensor
      return;
    const auto &def_factor = def_factors.getOnlyElement();
    const auto backend = def_factor.backend();
    auto &partial_graph = *context_data_map[backend].graph;
    auto &operand_layouts = context_data_map[backend].operand_layouts;
    assert(operand_layouts.find(operand_ind) == operand_layouts.end());
    operand_layouts[operand_ind] = def_factor.layout();

    // Copy the operand and insert it into the partial graph
    auto new_operand = std::make_unique<ir::Operand>(operand);
    new_operand->clearDefUse();
    operand.releaseData(); // Deref data of LoweredGraph
    auto new_operand_ind = partial_graph.addOperand(operand_ind, std::move(new_operand));
    UNUSED_RELEASE(new_operand_ind);
    assert(new_operand_ind == operand_ind);
  });
  // Separate operations into partial graphs
  whole_graph.operations().iterate(
    [&](const ir::OperationIndex &op_ind, const ir::Operation &operation) {
      auto &op_li = lgraph.lower_info().operation;
      auto backend = op_li.at(op_ind).backend();
      auto &partial_graph = *context_data_map[backend].graph;
      auto &external_operands = context_data_map[backend].external_operands;
      auto &operand_layouts = context_data_map[backend].operand_layouts;

      {
        // Add missing operands (externals)
        auto io_list = (operation.getInputs() + operation.getOutputs()) | ir::Remove::DUPLICATED |
                       ir::Remove::UNDEFINED;
        for (auto operand_ind : io_list)
        {
          if (partial_graph.operands().exist(operand_ind))
            continue;

          // Copy the operand and insert it into the partial graph
          const auto &operand = whole_graph.operands().at(operand_ind);
          auto new_operand = std::make_unique<ir::Operand>(operand);
          new_operand->clearDefUse();
          auto new_operand_ind = partial_graph.addOperand(operand_ind, std::move(new_operand));
          UNUSED_RELEASE(new_operand_ind);
          assert(new_operand_ind == operand_ind);

          auto layout =
            lgraph.lower_info().operand.at(operand_ind).def_factors().getOnlyElement().layout();
          assert(operand_layouts.find(operand_ind) == operand_layouts.end());
          operand_layouts[operand_ind] = layout;
          external_operands.add(operand_ind);
        }

        auto new_op_ind = partial_graph.addOperation(op_ind, clone(operation));
        UNUSED_RELEASE(new_op_ind);
        assert(new_op_ind == op_ind);
      }
    });

  // Create contexts
  auto whole_op_order = lgraph.graph().topolSortOperations();
  for (auto &&pair : context_data_map)
  {
    auto backend = pair.first;
    auto &data = pair.second;
    // Handle graph inputs/outputs or external tensors
    data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &operand) {
      if (whole_graph.getInputs().contains(ind) || whole_graph.getOutputs().contains(ind))
        data.external_operands.add(ind);
      // Inputs are either "graph input" or "no def op and non-constant"
      if (whole_graph.getInputs().contains(ind) ||
          (!operand.getDef().valid() && !operand.isConstant()))
        data.graph->addInput(ind);
      // Outputs are either "graph output" or "no uses"
      if (whole_graph.getOutputs().contains(ind) || operand.getUses().size() == 0)
        data.graph->addOutput(ind);
    });
    dumper::text::dumpGraph(*data.graph);

    std::copy_if(whole_op_order.begin(), whole_op_order.end(), std::back_inserter(data.op_order),
                 [&](const auto &ind) { return data.graph->operations().exist(ind); });
    data.is_linear_executor = linear_executor;
    data.custom_kernel_builder = lgraph.graph().getKernelBuilder();
    contexts.emplace(backend, backend->newContext(std::move(data)));
  }
  return contexts;
}

} // namespace
} // namespace onert

namespace onert
{
namespace compiler
{

ExecutorFactory &ExecutorFactory::get()
{
  static ExecutorFactory singleton;
  return singleton;
}

ExecutorFactory::ExecutorFactory()
{
  _map["Linear"] = createLinearExecutor;
  _map["Dataflow"] =
    std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
              std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, false);
  _map["Parallel"] =
    std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
              std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, true);
}

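// Dispatches to the builder registered for options.executor
// ("Linear", "Dataflow" or "Parallel").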
exec::IExecutor *ExecutorFactory::create(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
                                         const util::TracingCtx *tracing_ctx,
                                         const compiler::CompilerOptions &options,
                                         const std::shared_ptr<exec::IExecutors> &executors,
                                         const ir::ModelIndex &index)
{
  return _map.at(options.executor)(std::move(lowered_graph), tracing_ctx, options, executors,
                                   index);
}

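// Makes every operation's input/output tensors visible to its backend by registering
// tensors owned by other backends as migrant tensors.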
void ExecutorFactory::prepareMigrantTensors(compiler::LoweredGraph &lowered_graph,
                                            const backend::BackendContexts &backend_contexts)
{
  TensorRegistries tensor_regs{backend_contexts, true};

  lowered_graph.graph().operations().iterate(
    [&](const ir::OperationIndex &op_ind, const ir::Operation &op) {
      auto lower_info = lowered_graph.lower_info().operation.getRawPtr(op_ind);
      auto &backend_ctx = backend_contexts.at(lower_info->backend());
      for (auto ind :
           (op.getInputs() + op.getOutputs()) | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
      {
        // If an operation's input/output tensor has no tensor object of its own,
        // it must be using a migrant tensor, so find the tensor in the other tensor
        // registries and register it with the current registry if it is portable
        if (!backend_ctx->tensor_registry->getITensor(ind))
        {
          auto tensor = tensor_regs.getITensor(ind);
          assert(tensor); // The tensor must have been registered
          auto ptensor = dynamic_cast<backend::IPortableTensor *>(tensor);
          if (ptensor)
            backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor);
        }
      }
    });
}

void ExecutorFactory::prepareBuiltinBackend(const TensorRegistries &tensor_regs,
                                            const std::shared_ptr<exec::IExecutors> &executors,
                                            const backend::BackendContexts &backend_contexts,
                                            const ir::ModelIndex &index)
{
  for (auto &&pair : backend_contexts)
  {
    auto builtin_context = dynamic_cast<backend::builtin::BackendContext *>(pair.second.get());
    if (builtin_context != nullptr)
    {
      auto builtin_kernel_gen = builtin_context->kernel_gen;
      builtin_kernel_gen->setTensorRegistries(tensor_regs);
      builtin_kernel_gen->setExecutors(executors);
      builtin_kernel_gen->setModelIndex(index);
    }
  }
}

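// Returns the backend contexts in the order kernel generation should visit them:
// non-builtin backends first, the builtin backend last (see the note inside).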
std::deque<std::pair<const backend::Backend *, backend::BackendContext *>>
ExecutorFactory::orderBackendContext(const backend::BackendContexts &backend_contexts)
{
  std::deque<std::pair<const backend::Backend *, backend::BackendContext *>> ordered_contexts;

  for (auto &&pair : backend_contexts)
  {
    // NOTE The builtin backend must be processed last. This is because the Permute
    // layer is the only operation that may have different ITensor objects for its
    // input and output, and it requires all other backends' tensors to be ready.
    if (pair.first->config()->id() == "builtin")
      ordered_contexts.emplace_back(pair.first, pair.second.get());
    else
      ordered_contexts.emplace_front(pair.first, pair.second.get());
  }

  return ordered_contexts;
}

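// Creates a LinearExecutor: builds per-backend contexts, linearizes the operation order,
// generates tensors and kernels, and plans deallocation of dynamic tensors at each
// operand's last use.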
exec::IExecutor *ExecutorFactory::createLinearExecutor(
  std::unique_ptr<compiler::LoweredGraph> lowered_graph, const util::TracingCtx *tracing_ctx,
  const compiler::CompilerOptions &options, const std::shared_ptr<exec::IExecutors> &executors,
  const ir::ModelIndex &index)
{
  auto &graph = lowered_graph->graph();

  backend::BackendContexts backend_contexts =
    createBackendContexts(*lowered_graph, options.executor == "Linear");

  TensorRegistries tensor_regs{backend_contexts, true};

  initializeSubgraphIOTensors(
    *lowered_graph, backend_contexts,
    (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) |
      ir::Remove::DUPLICATED | ir::Remove::UNDEFINED);

  // Linearize
  auto order = Linear::linearize(*lowered_graph);
  Linear::dump(*lowered_graph, order);

  for (auto &&pair : backend_contexts)
  {
    pair.second->genTensors();
  }

  prepareMigrantTensors(*lowered_graph, backend_contexts);

  // Give some runtime objects to builtin KernelGenerator
  prepareBuiltinBackend(tensor_regs, executors, backend_contexts, index);

  ExecutionBuilder builder;

  // Adjust the order of backends for the upcoming iteration
  auto ordered_contexts = orderBackendContext(backend_contexts);

  // Simulate the execution in linearized order to find the last use of each operand,
  // so the corresponding tensor can be deallocated right after that operation
  std::unordered_map<ir::OperationIndex, DeallocList> dealloc_list_map;
  {
    ir::OperandIndexMap<uint32_t> uses_map;
    ir::OperandIndexSequence constants;

    auto model_io =
      (graph.getInputs() + graph.getOutputs()) | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;

    // Prepare scanning
    graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
      uses_map[ind] = obj.getUses().size();

      if (obj.isConstant())
        constants.append(ind);
    });

    // A trick to treat constants as an exception
    for (const auto &ind : constants)
    {
      uses_map[ind]++;
    }

    for (const auto op_ind : order)
    {
      const auto &op = graph.operations().at(op_ind);
      auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
      auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;

      for (const auto &ind : op_inputs)
      {
        const auto &operand = graph.operands().at(ind);
        assert(uses_map.find(ind) != uses_map.end());
        assert(uses_map[ind] > 0);
        uses_map[ind]--;
        if (uses_map[ind] == 0 && !operand.info().isVariable() && !model_io.contains(ind))
        {
          dealloc_list_map[op_ind].emplace_back(tensor_regs.getITensor(ind));
        }
      }
    }

    // Dispose and validate
    for (const auto &ind : constants)
    {
      --uses_map[ind];
    }

    assert(
      std::all_of(uses_map.begin(), uses_map.end(),
                  [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
  }

  // Generate kernels
  for (auto &&pair : ordered_contexts)
  {
    auto codes = pair.second->genKernels();
    for (auto &&pair : codes)
    {
      auto &op_ind = pair.first;
      auto &fn_seq = pair.second;
      auto &op = lowered_graph->graph().operations().at(op_ind);
      auto lower_info = lowered_graph->lower_info().operation.getRawPtr(op_ind);
      if (options.he_profiling_mode)
        fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
      if (!dealloc_list_map[op_ind].empty())
        fn_seq->append(std::make_unique<DeallocFunction>(dealloc_list_map[op_ind]));
      builder.append(op_ind, {op_ind, &op, lower_info, std::move(fn_seq)});
    }
  }

  auto code_map = builder.releaseCodeMap();

  auto exec = new exec::LinearExecutor{std::move(lowered_graph),
                                       std::move(backend_contexts),
                                       tensor_regs,
                                       std::move(code_map),
                                       order,
                                       tracing_ctx};

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
      std::make_unique<exec::TracingObserver>(options.trace_filepath, exec->graph(), tracing_ctx);
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

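// Creates a DataflowExecutor, or a ParallelExecutor when `parallel` is true: builds
// per-backend contexts, generates tensors and kernels, and attaches profiling/tracing
// observers according to the compiler options.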
exec::IExecutor *ExecutorFactory::createDataflowExecutor(
  std::unique_ptr<compiler::LoweredGraph> lowered_graph, const util::TracingCtx *tracing_ctx,
  const compiler::CompilerOptions &options, const std::shared_ptr<exec::IExecutors> &executors,
  const ir::ModelIndex &index, bool parallel)
{
  backend::BackendContexts backend_contexts =
    createBackendContexts(*lowered_graph, options.executor == "Linear");

  TensorRegistries tensor_regs{backend_contexts, true};

  initializeSubgraphIOTensors(
    *lowered_graph, backend_contexts,
    (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) |
      ir::Remove::DUPLICATED | ir::Remove::UNDEFINED);

  for (auto &&pair : backend_contexts)
  {
    pair.second->genTensors();
  }

  prepareMigrantTensors(*lowered_graph, backend_contexts);

  // Give some runtime objects to builtin KernelGenerator
  prepareBuiltinBackend(tensor_regs, executors, backend_contexts, index);

  ExecutionBuilder builder;

  // Adjust the order of backends for the upcoming iteration
  auto ordered_contexts = orderBackendContext(backend_contexts);

  // Generate kernels
  for (auto &&pair : ordered_contexts)
  {
    auto codes = pair.second->genKernels();
    for (auto &&pair : codes)
    {
      auto &op_ind = pair.first;
      auto &fn_seq = pair.second;
      auto &op = lowered_graph->graph().operations().at(op_ind);
      auto lower_info = lowered_graph->lower_info().operation.getRawPtr(op_ind);
      if (options.he_profiling_mode)
        fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
      builder.append(op_ind, {op_ind, &op, lower_info, std::move(fn_seq)});
    }
  }

  auto code_map = builder.releaseCodeMap();

  exec::ExecutorBase *exec = nullptr;
  if (parallel)
  {
    exec = new exec::ParallelExecutor{std::move(lowered_graph), std::move(backend_contexts),
                                      tensor_regs, std::move(code_map), tracing_ctx};
  }
  else
  {
    auto dataflow_exec =
      new exec::DataflowExecutor{std::move(lowered_graph), std::move(backend_contexts), tensor_regs,
                                 std::move(code_map), tracing_ctx};
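    // In profiling mode, attach a ProfileObserver backed by exec::ExecTime for the
    // registered backends so per-operation execution can be profiled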
    if (options.he_profiling_mode)
    {
      std::vector<const backend::Backend *> backends;
      for (const auto &pair : backend_contexts)
      {
        backends.push_back(pair.first);
      }
      auto et = std::make_shared<exec::ExecTime>(backends);
      std::unique_ptr<exec::IExecutionObserver> obs =
        std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
      dataflow_exec->addObserver(std::move(obs));
    }
    exec = dataflow_exec;
  }

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
      std::make_unique<exec::TracingObserver>(options.trace_filepath, exec->graph(), tracing_ctx);
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

} // namespace compiler
} // namespace onert