/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "ExecutorFactory.h"

#include <functional>
#include "exec/ExecutionObservers.h"
#include "exec/LinearExecutor.h"
#include "exec/DataflowExecutor.h"
#include "exec/ParallelExecutor.h"
#include "compiler/BackendManager.h"
#include "compiler/ExecutionBuilder.h"
#include "exec/ExecTime.h"
#include "compiler/Linear.h"
#include "backend/IConstantInitializer.h"
#include "backend/IKernelGenerator.h"
#include "backend/IOptimizer.h"
#include "backend/ITensorRegister.h"
#include "backend/controlflow/Config.h"
#include "backend/controlflow/KernelGenerator.h"
#include "backend/controlflow/UserTensor.h"
#include "backend/controlflow/TensorBuilder.h"
#include <memory>
#include <cassert>

namespace onert
{
namespace
{

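// Wraps a generated kernel function and calls the backend's sync() right after
// it runs. Used when profiling mode (he_profiling_mode) is enabled so that each
// operation completes before timing continues.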
class SyncFunction final : public exec::IFunction
{
public:
  virtual ~SyncFunction() = default;
  SyncFunction(std::unique_ptr<exec::IFunction> fn, const std::shared_ptr<backend::IConfig> config)
      : _fn{std::move(fn)}, _config{config}
  {
    assert(_fn);
    assert(_config);
  }

  void run() override
  {
    _fn->run();
    _config->sync();
  }

  void prepare() override { _fn->prepare(); }

private:
  std::unique_ptr<exec::IFunction> _fn;
  std::shared_ptr<backend::IConfig> _config;
};

} // namespace
} // namespace onert

namespace onert
{
namespace compiler
{

ExecutorFactory &ExecutorFactory::get()
{
  static ExecutorFactory singleton;
  return singleton;
}

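// Register the available executor kinds. "Dataflow" and "Parallel" share one
// factory function and differ only in the `parallel` flag.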
ExecutorFactory::ExecutorFactory()
{
  _map["Linear"] = createLinearExecutor;
  _map["Dataflow"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
                               std::placeholders::_3, false);
  _map["Parallel"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
                               std::placeholders::_3, true);
}

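// Create an executor of the kind requested by the compiler options
// ("Linear", "Dataflow" or "Parallel").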
exec::IExecutor *ExecutorFactory::create(std::unique_ptr<ir::LoweredGraph> lowered_graph,
                                         const compiler::CompilerOptions &options,
                                         const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
  return _map.at(options.executor)(std::move(lowered_graph), options, executor_map);
}

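// Group operations and operands by the backend they were lowered to, then hand
// each backend context its own operation/operand lists via initialize().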
void ExecutorFactory::initializeBackendContext(ir::LoweredGraph *lowered_graph)
{
  struct Entry
  {
    std::vector<backend::BackendContext::OperationInfo> operation_list;
    std::vector<ir::OperandIndex> operand_list;
  };
  std::unordered_map<const backend::Backend *, Entry> backend_assets;

  // Build lists for operations
  lowered_graph->op_seqs().iterate(
      [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
        auto &op_seq_li = lowered_graph->getLowerInfo()->op_seq;
        auto backend = op_seq_li.at(op_seq_index)->backend();
        for (auto &operation_idx : op_seq.operations())
        {
          backend_assets[backend].operation_list.emplace_back(operation_idx, op_seq.getLayout());
        }
      });

  // Build lists for operands
  lowered_graph->graph().operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
    const auto lower_info = lowered_graph->getLowerInfo(ind);
    for (auto factor : lower_info->def_factors())
    {
      auto backend = factor.backend();
      backend_assets[backend].operand_list.emplace_back(ind);
    }
  });

  for (auto &pair : backend_assets)
  {
    auto backend = pair.first;
    auto &arg = pair.second;
    lowered_graph->backend_contexts().at(backend)->initialize(arg.operation_list, arg.operand_list);
  }
}

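// Register tensor info to each backend's tensor builder, following the given
// op sequence order. A backend may provide its own ITensorRegister; otherwise
// the default registration below is used.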
void ExecutorFactory::runTensorRegistration(ir::LoweredGraph *lowered_graph,
                                            const std::vector<ir::OpSequenceIndex> &order)
{
  for (const auto index : order)
  {
    const auto &op_seq = lowered_graph->op_seqs().at(index);
    const auto backend = lowered_graph->getLowerInfo(index)->backend();
    const auto tensor_register = lowered_graph->backend_contexts().at(backend)->tensor_register;
    auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
    if (tensor_register)
    {
      // Custom registration
      tensor_register->registerTensors(op_seq, lowered_graph->getLowerInfo());
    }
    else
    {
      // Default registration
      for (const auto op_idx : op_seq)
      {
        const auto &op = lowered_graph->graph().operations().at(op_idx);
        for (const auto &index : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs())
        {
          if (!tensor_builder->isRegistered(index))
          {
            const auto &operand_lower_info =
                lowered_graph->getLowerInfo(index)->def_factors().getOnlyElement();

            // E.g., permute (CPU) -> tensor A -> MaxPool2D (acl_cl)
            // op.getOutputs() of permute (CPU) returns tensor A,
            // but tensor A belongs to the acl_cl backend.
            // So this tensor must NOT be registered for CPU.
            if (operand_lower_info.backend() != backend)
              continue;

            const auto &obj = lowered_graph->graph().operands().at(index);
            const auto frontend_layout = op_seq.getLayout();
            const auto backend_layout = operand_lower_info.layout();
            ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
                                         obj.typeInfo(), obj.info().memAllocType(),
                                         obj.isConstant()};
            tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
          }
        }
      }
    }
  }
}

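// Create UserTensor objects for the model's input/output operands and register
// them to the controlflow backend, so user-provided buffers can later be bound
// to them.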
std::vector<std::shared_ptr<backend::ITensor>>
ExecutorFactory::initializeModelIOTensors(ir::LoweredGraph &lowered_graph,
                                          const ir::OperandIndexSequence &indices)
{
  std::vector<std::shared_ptr<backend::ITensor>> ret;

  TensorBuilders tensor_builders{lowered_graph.backend_contexts(), false};
  std::shared_ptr<backend::controlflow::TensorBuilder> cf_tensor_builder =
      tensor_builders.getControlflowTensorBuilder();
  assert(cf_tensor_builder);

  for (auto ind : indices)
  {
    const auto &operand = lowered_graph.graph().operands().at(ind);
    auto tensor = std::make_shared<backend::controlflow::UserTensor>(
        operand.info(),
        ir::Layout::NHWC, /* FIXME find op_seq for this operand and use frontend_layout */
        cf_tensor_builder->dynamicTensorManager());

    // Add tensor to controlflow TensorRegistry.
    cf_tensor_builder->setUserTensor(ind, tensor);
    ret.push_back(tensor);
  }
  return ret;
}

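// For every op sequence input/output that is owned by another backend, register
// it as a migrant tensor in this backend's tensor builder (portable tensors only).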
void ExecutorFactory::prepareExternalTensors(ir::LoweredGraph &lowered_graph,
                                             TensorBuilders &tensor_builders)
{
  lowered_graph.op_seqs().iterate(
      [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
        auto lower_info = lowered_graph.getLowerInfo(op_seq_index);
        auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend());
        for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED |
                            ir::Remove::UNDEFINED)
        {
          // If an OpSequence input/output tensor does not have its own tensor object,
          // it must be using an external tensor, so find the tensor in the other tensor
          // builders and set it on this tensor builder if it is portable.
          if (!backend_ctx->tensor_builder->tensorAt(ind))
          {
            auto tensor = tensor_builders.getITensor(ind);
            assert(tensor); // The tensor must have been created in one of the TensorBuilders
            auto ptensor = std::dynamic_pointer_cast<backend::IPortableTensor>(tensor);
            if (ptensor)
              backend_ctx->tensor_builder->setMigrantTensor(ind, ptensor);
          }
        }
      });
}

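// Build a LinearExecutor: run backend optimizers, linearize the graph, register
// and plan tensors, generate kernels, and allocate memory up front.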
exec::IExecutor *
ExecutorFactory::createLinearExecutor(std::unique_ptr<ir::LoweredGraph> lowered_graph,
                                      const compiler::CompilerOptions &options,
                                      const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
  const auto &backend_contexts = lowered_graph->backend_contexts();

  initializeBackendContext(lowered_graph.get());

  // linearize
  assert(!lowered_graph->graph().isBuildingPhase());

  /*************************************************
   * Backend dependent analysis & optimization phase
   *************************************************/

  for (auto &pair : backend_contexts)
  {
    auto &optimizer = pair.second->optimizer;
    if (optimizer)
      optimizer->optimize();
  }

  /**********************************************************
   * Backend dependent analysis & optimization phase finished
   **********************************************************/

  /***********************
   * Code generation phase
   ***********************/

  auto order = Linear::linearize(*lowered_graph);
  runTensorRegistration(lowered_graph.get(), order);

  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
  if (options.is_primary_subgraph)
  {
    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
  }

  Linear::dump(*lowered_graph, order);
  Linear::planTensors(*lowered_graph, order);

  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->prepare();
  }

  prepareExternalTensors(*lowered_graph, tensor_builders);

  ExecutionBuilder builder;

  // Generate kernels
  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
                                        const ir::OpSequence &op_seq) {
    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
    // Pass the TensorBuilderSet and ExecutorMap to the control flow kernel generator
    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
    if (cf_kernel_gen != nullptr)
    {
      cf_kernel_gen->setTensorBuilderSet(tensor_builders);
      cf_kernel_gen->setExecutorMap(executor_map);
    }
    auto fn_seq = kernel_gen->generate(op_seq);
    if (options.he_profiling_mode)
    {
      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
    }
    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
  });

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->allocate();
  }

  for (auto &pair : backend_contexts)
  {
    pair.second->initConsts();
  }

  lowered_graph->graph().operands().iterate(
      [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });

  auto code_map = builder.releaseCodeMap();

  for (auto &it : code_map)
  {
    auto op_seq_index = it.first;
    auto &fn_seq = it.second.fn_seq;

    fn_seq->iterate([&](exec::IFunction &ifunc) {
      ifunc.prepare();
      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
      tensor_builder->postFunctionPrepare();
    });
  }

  auto exec =
      new exec::LinearExecutor{std::move(lowered_graph), input_tensors,       output_tensors,
                               tensor_builders,          std::move(code_map), order};

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
        std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

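// Build a DataflowExecutor or, when `parallel` is true, a ParallelExecutor.
// Unlike the linear path, tensors are kept allocated for the whole execution
// (see the static memory planner workaround below).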
exec::IExecutor *ExecutorFactory::createDataflowExecutor(
    std::unique_ptr<ir::LoweredGraph> lowered_graph, const compiler::CompilerOptions &options,
    const std::shared_ptr<exec::ExecutorMap> &executor_map, bool parallel)
{
  const auto &backend_contexts = lowered_graph->backend_contexts();

  initializeBackendContext(lowered_graph.get());

  auto order = Linear::linearize(*lowered_graph);
  runTensorRegistration(lowered_graph.get(), order);

  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
  if (options.is_primary_subgraph)
  {
    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
  }

  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};

  // Workaround for the static memory planner: mark every registered operand as used
  // so that no tensor is ever deallocated
  for (auto &tensor_builder : tensor_builders)
  {
    lowered_graph->graph().operands().iterate(
        [&](const ir::OperandIndex &ind, const ir::Operand &) {
          if (tensor_builder->isRegistered(ind))
          {
            tensor_builder->notifyFirstUse(ind);
          }
        });
  }

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->prepare();
  }

  prepareExternalTensors(*lowered_graph, tensor_builders);

  ExecutionBuilder builder;

  // Generate kernels
  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
                                        const ir::OpSequence &op_seq) {
    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
    // Pass the TensorBuilderSet and ExecutorMap to the control flow kernel generator
    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
    if (cf_kernel_gen != nullptr)
    {
      cf_kernel_gen->setTensorBuilderSet(tensor_builders);
      cf_kernel_gen->setExecutorMap(executor_map);
    }
    auto fn_seq = kernel_gen->generate(op_seq);
    if (options.he_profiling_mode)
    {
      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
    }
    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
  });

  for (const auto &tensor_builder : tensor_builders)
  {
    tensor_builder->allocate();
  }

  for (auto &pair : backend_contexts)
  {
    pair.second->initConsts();
  }

  lowered_graph->graph().operands().iterate(
      [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });

  auto code_map = builder.releaseCodeMap();

  for (auto &it : code_map)
  {
    auto op_seq_index = it.first;
    auto &fn_seq = it.second.fn_seq;

    fn_seq->iterate([&](exec::IFunction &ifunc) {
      ifunc.prepare();
      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
      tensor_builder->postFunctionPrepare();
    });
  }

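  // Choose the concrete executor; in HE profiling mode the dataflow executor
  // additionally gets a ProfileObserver that records execution times per backend.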
  exec::ExecutorBase *exec = nullptr;
  if (parallel)
  {
    exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors, output_tensors,
                                      tensor_builders, std::move(code_map)};
  }
  else
  {
    auto dataflow_exec =
        new exec::DataflowExecutor{std::move(lowered_graph), input_tensors, output_tensors,
                                   tensor_builders, std::move(code_map)};
    if (options.he_profiling_mode)
    {
      std::vector<const backend::Backend *> backends;
      for (const auto &pair : backend_contexts)
      {
        backends.push_back(pair.first);
      }
      auto et = std::make_shared<exec::ExecTime>(backends);
      std::unique_ptr<exec::IExecutionObserver> obs =
          std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
      dataflow_exec->addObserver(std::move(obs));
    }
    exec = dataflow_exec;
  }

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
        std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

} // namespace compiler
} // namespace onert