/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "ExecutorFactory.h"

#include "exec/ExecutionObservers.h"
#include "exec/LinearExecutor.h"
#include "exec/DataflowExecutor.h"
#include "exec/ParallelExecutor.h"
#include "compiler/BackendManager.h"
#include "compiler/ExecutionBuilder.h"
#include "exec/ExecTime.h"
#include "compiler/Linear.h"
#include "backend/IConstantInitializer.h"
#include "backend/IKernelGenerator.h"
#include "backend/IOptimizer.h"
#include "backend/ITensorRegister.h"
#include "backend/controlflow/Config.h"
#include "backend/controlflow/KernelGenerator.h"
#include "backend/controlflow/UserTensor.h"
#include "backend/controlflow/TensorBuilder.h"

#include <cassert>    // assert
#include <functional> // std::bind, std::placeholders
#include <memory>     // std::unique_ptr, std::shared_ptr
#include <unordered_map>
#include <vector>

namespace onert
{
namespace compiler
{

namespace
{
// Wraps a generated function and synchronizes with the backend after every
// run; used in profiling mode so per-kernel times are measurable even on
// backends that execute asynchronously.
class SyncFunction final : public exec::IFunction
{
public:
  virtual ~SyncFunction() = default;
  SyncFunction(std::unique_ptr<exec::IFunction> fn, const std::shared_ptr<backend::IConfig> config)
      : _fn{std::move(fn)}, _config{config}
  {
    assert(_fn);
    assert(_config);
  }

  void run() override
  {
    _fn->run();
    _config->sync(); // Wait for the backend to finish the enqueued work
  }

  void prepare() override { _fn->prepare(); }

private:
  std::unique_ptr<exec::IFunction> _fn;
  std::shared_ptr<backend::IConfig> _config;
};

} // namespace
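// The factory is a stateless singleton; the function-local static gives
// lazy, thread-safe initialization (guaranteed since C++11).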
ExecutorFactory &ExecutorFactory::get()
{
  static ExecutorFactory singleton;
  return singleton;
}
ExecutorFactory::ExecutorFactory()
{
  _map["Linear"] = createLinearExecutor;
  // "Dataflow" and "Parallel" share one implementation; the trailing bool
  // argument selects parallel execution.
  _map["Dataflow"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
                               std::placeholders::_3, false);
  _map["Parallel"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
                               std::placeholders::_3, true);
}
exec::IExecutor *ExecutorFactory::create(std::unique_ptr<ir::LoweredGraph> lowered_graph,
                                         const compiler::CompilerOptions &options,
                                         const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
  // Throws std::out_of_range if options.executor names an unknown executor kind
  return _map.at(options.executor)(std::move(lowered_graph), options, executor_map);
}
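// Usage sketch (illustrative, not a call site in this file; assumes the
// caller owns a lowered graph, compiler options, and an executor map):
//
//   auto *executor =
//       ExecutorFactory::get().create(std::move(lowered_graph), options, executor_map);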
void ExecutorFactory::initializeBackendContext(ir::LoweredGraph *lowered_graph)
{
  struct Entry
  {
    std::vector<backend::BackendContext::OperationInfo> operation_list;
    std::vector<ir::OperandIndex> operand_list;
  };
  std::unordered_map<const backend::Backend *, Entry> backend_assets;

  // Build lists for operations
  lowered_graph->op_seqs().iterate(
      [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
        auto &op_seq_li = lowered_graph->getLowerInfo()->op_seq;
        auto backend = op_seq_li.at(op_seq_index)->backend();
        for (auto &operation_idx : op_seq.operations())
        {
          backend_assets[backend].operation_list.emplace_back(operation_idx, op_seq.getLayout());
        }
      });
  // Build lists for operands
  lowered_graph->graph().operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
    const auto lower_info = lowered_graph->getLowerInfo(ind);
    for (auto factor : lower_info->def_factors())
    {
      auto backend = factor.backend();
      backend_assets[backend].operand_list.emplace_back(ind);
    }
  });

  for (auto &pair : backend_assets)
  {
    auto backend = pair.first;
    auto &arg = pair.second;
    lowered_graph->backend_contexts().at(backend)->initialize(arg.operation_list, arg.operand_list);
  }
}
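// Registers tensor info with each backend's TensorBuilder, visiting op
// sequences in the given order. A backend may provide its own ITensorRegister
// for custom registration; otherwise each operand defined on the backend is
// registered with its shape permuted from the frontend layout to the backend
// layout (e.g., an NHWC shape {1, 224, 224, 3} becomes {1, 3, 224, 224} in NCHW).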
void ExecutorFactory::runTensorRegistration(ir::LoweredGraph *lowered_graph,
                                            const std::vector<ir::OpSequenceIndex> &order)
{
  for (const auto index : order)
  {
    const auto &op_seq = lowered_graph->op_seqs().at(index);
    const auto backend = lowered_graph->getLowerInfo(index)->backend();
    const auto tensor_register = lowered_graph->backend_contexts().at(backend)->tensor_register;
    auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
    if (tensor_register)
    {
      // Custom registration
      tensor_register->registerTensors(op_seq, lowered_graph->getLowerInfo());
    }
    else
    {
      // Default registration
      for (const auto op_idx : op_seq)
      {
        const auto &op = lowered_graph->graph().operations().at(op_idx);
        for (const auto &index : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs())
        {
          if (!tensor_builder->isRegistered(index))
          {
            const auto &operand_lower_info =
                lowered_graph->getLowerInfo(index)->def_factors().getOnlyElement();

            // E.g., permute (CPU) -> tensor A -> MaxPool2D(acl_cl):
            // op.getOutputs() of permute (CPU) returns tensor A,
            // but tensor A belongs to the acl_cl backend,
            // so tensor A must NOT be registered for CPU.
            if (operand_lower_info.backend() != backend)
              continue;

            const auto &obj = lowered_graph->graph().operands().at(index);
            const auto frontend_layout = op_seq.getLayout();
            const auto backend_layout = operand_lower_info.layout();
            ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
                                         obj.typeInfo(), obj.info().memAllocType(),
                                         obj.isConstant()};
            tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
          }
        }
      }
    }
  }
}
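// Wraps the model's input/output operands as controlflow UserTensor objects
// so that user-provided buffers can back them at execution time. Both call
// sites guard this with options.is_primary_subgraph.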
std::vector<std::shared_ptr<backend::ITensor>>
ExecutorFactory::initializeModelIOTensors(ir::LoweredGraph &lowered_graph,
                                          const ir::OperandIndexSequence &indices)
{
  std::vector<std::shared_ptr<backend::ITensor>> ret;

  TensorBuilders tensor_builders{lowered_graph.backend_contexts(), false};
  std::shared_ptr<backend::controlflow::TensorBuilder> cf_tensor_builder =
      tensor_builders.getControlflowTensorBuilder();
  assert(cf_tensor_builder);

  for (auto ind : indices)
  {
    const auto &operand = lowered_graph.graph().operands().at(ind);
    auto tensor = std::make_shared<backend::controlflow::UserTensor>(
        operand.info(),
        ir::Layout::NHWC, /* FIXME find op_seq for this operand and use frontend_layout */
        cf_tensor_builder->dynamicTensorManager());

    // Add tensor to controlflow TensorRegistry.
    cf_tensor_builder->setUserTensor(ind, tensor);
    ret.push_back(tensor);
  }
  return ret;
}
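// An op sequence may consume a tensor that is owned by a different backend.
// For each such tensor, look it up across all TensorBuilders and register it
// with the consuming backend as a migrant tensor, provided it is portable
// (i.e., it implements IPortableTensor).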
void ExecutorFactory::prepareExternalTensors(ir::LoweredGraph &lowered_graph,
                                             TensorBuilders &tensor_builders)
{
  lowered_graph.op_seqs().iterate(
      [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
        auto lower_info = lowered_graph.getLowerInfo(op_seq_index);
        auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend());
        for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED |
                            ir::Remove::UNDEFINED)
        {
          // If an OpSequence input/output tensor does not have its own tensor object,
          // it must be using an external tensor, so find the tensor in the other tensor
          // builders and register it with this tensor builder if it is portable
          if (!backend_ctx->tensor_builder->tensorAt(ind))
          {
            auto tensor = tensor_builders.getITensor(ind);
            assert(tensor); // The tensor must have been created in one of the TensorBuilders
            auto ptensor = std::dynamic_pointer_cast<backend::IPortableTensor>(tensor);
            if (ptensor)
              backend_ctx->tensor_builder->setMigrantTensor(ind, ptensor);
          }
        }
      });
}
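// Builds a LinearExecutor: run backend optimizers, linearize the graph into a
// fixed execution order, plan and allocate tensors along that order, and
// generate kernels per op sequence.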
exec::IExecutor *
ExecutorFactory::createLinearExecutor(std::unique_ptr<ir::LoweredGraph> lowered_graph,
                                      const compiler::CompilerOptions &options,
                                      const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
  const auto &backend_contexts = lowered_graph->backend_contexts();

  initializeBackendContext(lowered_graph.get());

  // Linearization requires a frozen graph
  assert(!lowered_graph->graph().isBuildingPhase());

  /*************************************************
   * Backend dependent analysis & optimization phase
   *************************************************/

  for (auto &pair : backend_contexts)
  {
    auto &optimizer = pair.second->optimizer;
    if (optimizer)
      optimizer->optimize();
  }
  /**********************************************************
   * Backend dependent analysis & optimization phase finished
   **********************************************************/

  /***********************
   * Code generation phase
   ***********************/

  auto order = Linear::linearize(*lowered_graph);
  runTensorRegistration(lowered_graph.get(), order);

  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
  if (options.is_primary_subgraph)
  {
    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
  }

  Linear::dump(*lowered_graph, order);
  Linear::planTensors(*lowered_graph, order);

  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->prepare();
  }

  prepareExternalTensors(*lowered_graph, tensor_builders);
  ExecutionBuilder builder;

  // Generate kernels
  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
                                        const ir::OpSequence &op_seq) {
    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
    // Pass the TensorBuilderSet and ExecutorMap to the controlflow backend's kernel generator
    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
    if (cf_kernel_gen != nullptr)
    {
      cf_kernel_gen->setTensorBuilderSet(tensor_builders);
      cf_kernel_gen->setExecutorMap(executor_map);
    }
    auto fn_seq = kernel_gen->generate(op_seq);
    if (options.he_profiling_mode)
    {
      // Synchronize with the backend after every function so kernel times are measurable
      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
    }
    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
  });
  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->allocate();
  }

  for (auto &pair : backend_contexts)
  {
    pair.second->initConsts();
  }

  // Constants have been copied into backend tensors; release the IR-side data
  lowered_graph->graph().operands().iterate(
      [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });

  auto code_map = builder.releaseCodeMap();

  for (auto &it : code_map)
  {
    auto op_seq_index = it.first;
    auto &fn_seq = it.second.fn_seq;

    fn_seq->iterate([&](exec::IFunction &ifunc) {
      ifunc.prepare();
      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
      tensor_builder->postFunctionPrepare();
    });
  }

  auto exec = new exec::LinearExecutor{std::move(lowered_graph), input_tensors, output_tensors,
                                       tensor_builders, std::move(code_map), order};
  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
        std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
    exec->addObserver(std::move(ctp));
  }

  return exec;
}
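// Builds a DataflowExecutor, or a ParallelExecutor when `parallel` is true.
// There is no fixed execution order here, so static memory planning cannot
// reuse buffers; every registered tensor is kept allocated for the lifetime
// of the executor (see the workaround below).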
exec::IExecutor *ExecutorFactory::createDataflowExecutor(
    std::unique_ptr<ir::LoweredGraph> lowered_graph, const compiler::CompilerOptions &options,
    const std::shared_ptr<exec::ExecutorMap> &executor_map, bool parallel)
{
  const auto &backend_contexts = lowered_graph->backend_contexts();

  initializeBackendContext(lowered_graph.get());

  auto order = Linear::linearize(*lowered_graph);
  runTensorRegistration(lowered_graph.get(), order);

  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
  if (options.is_primary_subgraph)
  {
    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
  }

  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};

  // Workaround: mark every registered tensor as in use from the start, so the
  // static memory planner never deallocates it
  for (auto &tensor_builder : tensor_builders)
  {
    lowered_graph->graph().operands().iterate(
        [&](const ir::OperandIndex &ind, const ir::Operand &) {
          if (tensor_builder->isRegistered(ind))
          {
            tensor_builder->notifyFirstUse(ind);
          }
        });
  }

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->prepare();
  }

  prepareExternalTensors(*lowered_graph, tensor_builders);
  ExecutionBuilder builder;

  // Generate kernels
  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
                                        const ir::OpSequence &op_seq) {
    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
    // Pass the TensorBuilderSet and ExecutorMap to the controlflow backend's kernel generator
    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
    if (cf_kernel_gen != nullptr)
    {
      cf_kernel_gen->setTensorBuilderSet(tensor_builders);
      cf_kernel_gen->setExecutorMap(executor_map);
    }
    auto fn_seq = kernel_gen->generate(op_seq);
    if (options.he_profiling_mode)
    {
      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
    }
    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
  });
  for (const auto &tensor_builder : tensor_builders)
  {
    tensor_builder->allocate();
  }

  for (auto &pair : backend_contexts)
  {
    pair.second->initConsts();
  }

  // Constants have been copied into backend tensors; release the IR-side data
  lowered_graph->graph().operands().iterate(
      [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });

  auto code_map = builder.releaseCodeMap();

  for (auto &it : code_map)
  {
    auto op_seq_index = it.first;
    auto &fn_seq = it.second.fn_seq;

    fn_seq->iterate([&](exec::IFunction &ifunc) {
      ifunc.prepare();
      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
      tensor_builder->postFunctionPrepare();
    });
  }
  exec::ExecutorBase *exec = nullptr;
  if (parallel)
  {
    exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors, output_tensors,
                                      tensor_builders, std::move(code_map)};
  }
  else
  {
    auto dataflow_exec =
        new exec::DataflowExecutor{std::move(lowered_graph), input_tensors, output_tensors,
                                   tensor_builders, std::move(code_map)};
    if (options.he_profiling_mode)
    {
      std::vector<const backend::Backend *> backends;
      for (const auto &pair : backend_contexts)
      {
        backends.push_back(pair.first);
      }
      auto et = std::make_shared<exec::ExecTime>(backends);
      std::unique_ptr<exec::IExecutionObserver> obs =
          std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
      dataflow_exec->addObserver(std::move(obs));
    }
    exec = dataflow_exec;
  }
  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
        std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

} // namespace compiler
} // namespace onert