/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "ExecutorFactory.h"

#include <functional>
#include "exec/ExecutionObservers.h"
#include "exec/LinearExecutor.h"
#include "exec/DataflowExecutor.h"
#include "exec/ParallelExecutor.h"
#include "compiler/BackendManager.h"
#include "compiler/ExecutionBuilder.h"
#include "exec/ExecTime.h"
#include "compiler/Linear.h"
#include "compiler/TensorBuilders.h"
#include "backend/IConstantInitializer.h"
#include "backend/IKernelGenerator.h"
#include "backend/IOptimizer.h"
#include "backend/ITensorRegister.h"
#include "backend/controlflow/Config.h"
#include "backend/controlflow/KernelGenerator.h"
#include "backend/controlflow/UserTensor.h"
#include "backend/controlflow/TensorBuilder.h"

namespace onert
{
namespace
{

// Wraps a generated function and synchronizes the backend after every run.
// Used in profiling mode so that asynchronous backends report accurate timings.
class SyncFunction final : public exec::IFunction
{
public:
  virtual ~SyncFunction() = default;
  SyncFunction(std::unique_ptr<exec::IFunction> fn, const std::shared_ptr<backend::IConfig> config)
      : _fn{std::move(fn)}, _config{config}
  {
    assert(_fn);
    assert(_config);
  }

  void run() override
  {
    _fn->run();
    _config->sync(); // wait until the backend finishes the enqueued work
  }

  void prepare() override { _fn->prepare(); }

private:
  std::unique_ptr<exec::IFunction> _fn;
  std::shared_ptr<backend::IConfig> _config;
};

// TODO Think of a better way to manage TensorManagers
backend::TensorManagerSet createTensorManagerSet(const compiler::TensorBuilders &tensor_builders)
{
  backend::TensorManagerSet tensor_mgrs;
  for (auto &tensor_builder : tensor_builders)
  {
    auto s_tensor_manager = tensor_builder->releaseStaticTensorManager();
    if (s_tensor_manager != nullptr)
      tensor_mgrs.insert(std::move(s_tensor_manager));

    auto d_tensor_manager = tensor_builder->releaseDynamicTensorManager();
    if (d_tensor_manager != nullptr)
      tensor_mgrs.insert(std::move(d_tensor_manager));
  }
  return tensor_mgrs;
}

} // namespace
} // namespace onert

namespace onert
{
namespace compiler
{

ExecutorFactory &ExecutorFactory::get()
{
  static ExecutorFactory singleton;
  return singleton;
}

ExecutorFactory::ExecutorFactory()
{
  _map["Linear"] = createLinearExecutor;
  _map["Dataflow"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
                               std::placeholders::_3, false);
  _map["Parallel"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
                               std::placeholders::_3, true);
}
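
// Dispatches to the creator registered under options.executor ("Linear", "Dataflow"
// or "Parallel"); _map.at() throws std::out_of_range for an unknown name. A sketch of
// a hypothetical call site:
//
//   auto *executor =
//       ExecutorFactory::get().create(std::move(lowered_graph), options, executor_map);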
exec::IExecutor *ExecutorFactory::create(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
                                         const compiler::CompilerOptions &options,
                                         const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
  return _map.at(options.executor)(std::move(lowered_graph), options, executor_map);
}
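
// Splits the lowered graph by backend: every operation (paired with its op sequence's
// layout) and every defined operand goes into the lists of the backend that lowering
// assigned to it, and each BackendContext is then initialized with its own lists.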
void ExecutorFactory::initializeBackendContext(compiler::LoweredGraph *lowered_graph)
{
  struct Entry
  {
    std::vector<backend::BackendContext::OperationInfo> operation_list;
    std::vector<ir::OperandIndex> operand_list;
  };
  std::unordered_map<const backend::Backend *, Entry> backend_assets;

  // Build lists for operations
  lowered_graph->op_seqs().iterate(
      [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
        auto &op_seq_li = lowered_graph->getLowerInfo()->op_seq;
        auto backend = op_seq_li.at(op_seq_index)->backend();
        for (auto &operation_idx : op_seq.operations())
        {
          backend_assets[backend].operation_list.emplace_back(operation_idx, op_seq.getLayout());
        }
      });

  // Build lists for operands
  lowered_graph->graph().operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
    const auto lower_info = lowered_graph->getLowerInfo(ind);
    for (auto factor : lower_info->def_factors())
    {
      auto backend = factor.backend();
      backend_assets[backend].operand_list.emplace_back(ind);
    }
  });

  for (auto &pair : backend_assets)
  {
    auto backend = pair.first;
    auto &arg = pair.second;
    lowered_graph->backend_contexts().at(backend)->initialize(arg.operation_list, arg.operand_list);
  }
}
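
// Registers tensor info with each backend's tensor builder, visiting op sequences in
// the given order. A backend that provides its own ITensorRegister does custom
// registration; otherwise every operand an operation touches is registered with its
// shape permuted from the frontend layout to the backend layout, skipping model I/O
// operands and operands owned by another backend.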
void ExecutorFactory::runTensorRegistration(compiler::LoweredGraph *lowered_graph,
                                            const std::vector<ir::OpSequenceIndex> &order)
{
  for (const auto index : order)
  {
    const auto &op_seq = lowered_graph->op_seqs().at(index);
    const auto backend = lowered_graph->getLowerInfo(index)->backend();
    const auto tensor_register = lowered_graph->backend_contexts().at(backend)->tensor_register;
    auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
    auto model_io = lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs();
    if (tensor_register)
    {
      // Custom registration
      tensor_register->registerTensors(op_seq, lowered_graph->getLowerInfo());
    }
    else
    {
      // Default registration
      for (const auto op_idx : op_seq)
      {
        const auto &op = lowered_graph->graph().operations().at(op_idx);
        for (const auto &index : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs())
        {
          if (!tensor_builder->isRegistered(index) && !model_io.contains(index))
          {
            const auto &operand_lower_info =
                lowered_graph->getLowerInfo(index)->def_factors().getOnlyElement();

            // E.g., permute (CPU) -> tensor A -> MaxPool2D(acl_cl)
            // op.getOutputs() of permute (CPU) returns tensor A
            // but tensor A belongs to the backend of acl_cl.
            // So, we have to make this tensor NOT registered for CPU.
            if (operand_lower_info.backend() != backend)
              continue;

            const auto &obj = lowered_graph->graph().operands().at(index);
            const auto frontend_layout = op_seq.getLayout();
            const auto backend_layout = operand_lower_info.layout();
            ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
                                         obj.typeInfo(), obj.info().memAllocType(),
                                         obj.isConstant()};
            tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
          }
        }
      }
    }
  }
}
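
// Creates controlflow UserTensor objects for the model I/O operands in `indices` and
// registers them in the controlflow backend's TensorRegistry so that user-provided
// buffers can be bound to them at execution time.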
std::vector<std::shared_ptr<backend::ITensor>>
ExecutorFactory::initializeModelIOTensors(compiler::LoweredGraph &lowered_graph,
                                          const ir::OperandIndexSequence &indices)
{
  std::vector<std::shared_ptr<backend::ITensor>> ret;

  // TODO Store controlflow backend in BackendContext
  std::shared_ptr<backend::controlflow::TensorBuilder> cf_tensor_builder;
  std::shared_ptr<backend::controlflow::TensorRegistry> cf_tensor_reg;
  for (const auto &e : lowered_graph.backend_contexts())
  {
    auto backend = e.first;
    auto &context = e.second;
    if (backend->config()->id() == backend::controlflow::Config::ID)
    {
      cf_tensor_builder =
          std::dynamic_pointer_cast<backend::controlflow::TensorBuilder>(context->tensor_builder);
      cf_tensor_reg =
          std::dynamic_pointer_cast<backend::controlflow::TensorRegistry>(context->tensor_registry);
    }
  }
  assert(cf_tensor_builder);
  assert(cf_tensor_reg);

  for (auto ind : indices)
  {
    const auto &operand = lowered_graph.graph().operands().at(ind);
    auto tensor = std::make_shared<backend::controlflow::UserTensor>(
        operand.info(),
        ir::Layout::NHWC, /* FIXME find op_seq for this operand and use frontend_layout */
        cf_tensor_builder->dynamicTensorManager());

    // Add tensor to controlflow TensorRegistry.
    cf_tensor_reg->setNativeUserTensor(ind, tensor);
    ret.push_back(tensor);
  }
  return ret;
}
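
// For each op sequence input/output that has no tensor object in its own backend's
// registry, finds the tensor in another backend's registry and, if it is portable
// (an IPortableTensor), registers it there as a migrant tensor.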
void ExecutorFactory::prepareExternalTensors(compiler::LoweredGraph &lowered_graph)
{
  TensorRegistries tensor_regs{lowered_graph.backend_contexts(), true};

  lowered_graph.op_seqs().iterate(
      [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
        auto lower_info = lowered_graph.getLowerInfo(op_seq_index);
        auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend());
        for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED |
                            ir::Remove::UNDEFINED)
        {
          // If an OpSequence input/output tensor does not have its own tensor object,
          // it must be using external tensors, so find the tensor from other tensor builders and
          // set the tensor to this tensor builder if portable
          if (!backend_ctx->tensor_registry->getITensor(ind))
          {
            auto tensor = tensor_regs.getITensor(ind);
            assert(tensor); // The tensor must have been registered
            auto ptensor = std::dynamic_pointer_cast<backend::IPortableTensor>(tensor);
            if (ptensor)
              backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor);
          }
        }
      });
}
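
// Builds a LinearExecutor: initializes backend contexts, runs backend-specific
// optimizations, linearizes the graph, registers and plans tensors, generates a
// kernel sequence per op sequence, then allocates tensors and initializes constants.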
exec::IExecutor *
ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
                                      const compiler::CompilerOptions &options,
                                      const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
  const auto &backend_contexts = lowered_graph->backend_contexts();

  initializeBackendContext(lowered_graph.get());

  assert(!lowered_graph->graph().isBuildingPhase());

  /*************************************************
   * Backend dependent analysis & optimization phase
   *************************************************/
  for (auto &pair : backend_contexts)
  {
    auto &optimizer = pair.second->optimizer;
    if (optimizer)
      optimizer->optimize();
  }

  /**********************************************************
   * Backend dependent analysis & optimization phase finished
   **********************************************************/

  /***********************
   * Code generation phase
   ***********************/

  auto order = Linear::linearize(*lowered_graph);
  runTensorRegistration(lowered_graph.get(), order);

  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
  if (options.is_primary_subgraph)
  {
    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
  }

  Linear::dump(*lowered_graph, order);
  Linear::planTensors(*lowered_graph, order);

  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};
  TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true};

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->prepare();
  }

  prepareExternalTensors(*lowered_graph);

  ExecutionBuilder builder;

  // Generate kernels
  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
                                        const ir::OpSequence &op_seq) {
    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
    // Pass the tensor registries and ExecutorMap to the controlflow kernel generator
    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
    if (cf_kernel_gen != nullptr)
    {
      cf_kernel_gen->setTensorRegistries(tensor_regs);
      cf_kernel_gen->setExecutorMap(executor_map);
    }
    auto fn_seq = kernel_gen->generate(op_seq);
    if (options.he_profiling_mode)
    {
      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
    }
    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
  });

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->allocate();
  }

  for (auto &pair : backend_contexts)
  {
    pair.second->initConsts();
  }

  lowered_graph->graph().operands().iterate(
      [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });

  auto code_map = builder.releaseCodeMap();

  for (auto &it : code_map)
  {
    auto op_seq_index = it.first;
    auto &fn_seq = it.second.fn_seq;

    fn_seq->iterate([&](exec::IFunction &ifunc) {
      ifunc.prepare();
      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
      tensor_builder->postFunctionPrepare();
    });
  }

  backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders);
  auto exec = new exec::LinearExecutor{
      std::move(lowered_graph), input_tensors, output_tensors, tensor_regs,
      std::move(tensor_mgrs),   std::move(code_map), order};

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
        std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
    exec->addObserver(std::move(ctp));
  }

  return exec;
}
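
// Builds a DataflowExecutor, or a ParallelExecutor when `parallel` is true. Unlike the
// linear path there is no tensor lifetime planning: every registered operand is marked
// as in use up front so the static memory planner never deallocates it.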
exec::IExecutor *ExecutorFactory::createDataflowExecutor(
    std::unique_ptr<compiler::LoweredGraph> lowered_graph, const compiler::CompilerOptions &options,
    const std::shared_ptr<exec::ExecutorMap> &executor_map, bool parallel)
{
  const auto &backend_contexts = lowered_graph->backend_contexts();

  initializeBackendContext(lowered_graph.get());

  auto order = Linear::linearize(*lowered_graph);
  runTensorRegistration(lowered_graph.get(), order);

  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
  if (options.is_primary_subgraph)
  {
    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
  }

  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};
  TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true};

  // Workaround to keep tensors alive: with the static memory planner, marking every
  // registered operand as used up front ensures no tensor is ever deallocated
  for (auto &tensor_builder : tensor_builders)
  {
    lowered_graph->graph().operands().iterate(
        [&](const ir::OperandIndex &ind, const ir::Operand &) {
          if (tensor_builder->isRegistered(ind))
          {
            tensor_builder->notifyFirstUse(ind);
          }
        });
  }

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->prepare();
  }

  prepareExternalTensors(*lowered_graph);

  ExecutionBuilder builder;

  // Generate kernels
  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
                                        const ir::OpSequence &op_seq) {
    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
    // Pass the tensor registries and ExecutorMap to the controlflow kernel generator
    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
    if (cf_kernel_gen != nullptr)
    {
      cf_kernel_gen->setTensorRegistries(tensor_regs);
      cf_kernel_gen->setExecutorMap(executor_map);
    }
    auto fn_seq = kernel_gen->generate(op_seq);
    if (options.he_profiling_mode)
    {
      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
    }
    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
  });

  for (const auto &tensor_builder : tensor_builders)
  {
    tensor_builder->allocate();
  }

  for (auto &pair : backend_contexts)
  {
    pair.second->initConsts();
  }

  lowered_graph->graph().operands().iterate(
      [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });

  auto code_map = builder.releaseCodeMap();

  for (auto &it : code_map)
  {
    auto op_seq_index = it.first;
    auto &fn_seq = it.second.fn_seq;

    fn_seq->iterate([&](exec::IFunction &ifunc) {
      ifunc.prepare();
      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
      tensor_builder->postFunctionPrepare();
    });
  }

  backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders);

  exec::ExecutorBase *exec = nullptr;
  if (parallel)
  {
    exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors,
                                      output_tensors,           tensor_regs,
                                      std::move(tensor_mgrs),   std::move(code_map)};
  }
  else
  {
    auto dataflow_exec = new exec::DataflowExecutor{std::move(lowered_graph), input_tensors,
                                                    output_tensors,           tensor_regs,
                                                    std::move(tensor_mgrs),   std::move(code_map)};
    if (options.he_profiling_mode)
    {
      std::vector<const backend::Backend *> backends;
      for (const auto &pair : backend_contexts)
      {
        backends.push_back(pair.first);
      }
      auto et = std::make_shared<exec::ExecTime>(backends);
      std::unique_ptr<exec::IExecutionObserver> obs =
          std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
      dataflow_exec->addObserver(std::move(obs));
    }
    exec = dataflow_exec;
  }

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
        std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

} // namespace compiler
} // namespace onert