Update ACL pin to b309fc249e4383b4d40ae03e377c3cbad3f9f5f7
src/armnn/LoadedNetwork.cpp (platform/upstream/armnn.git)
//
// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "LoadedNetwork.hpp"
#include "Layer.hpp"
#include "Graph.hpp"
#include "Network.hpp"
#include <Processes.hpp>
#include "Profiling.hpp"
#include "HeapProfiling.hpp"

#include <armnn/BackendRegistry.hpp>
#include <armnn/Logging.hpp>
#include <armnn/utility/Assert.hpp>

#include <backendsCommon/CpuTensorHandle.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <backendsCommon/MemCopyWorkload.hpp>
#include <backendsCommon/MemSyncWorkload.hpp>

#include <LabelsAndEventClasses.hpp>

#include <fmt/format.h>

namespace armnn
{

using namespace std;
using namespace armnn::profiling;

namespace
{

template <typename ExceptionType>
std::string ToErrorMessage(const char * prefix, const ExceptionType & error)
{
    std::stringstream ss;
    ss << prefix << " " << error.what();
    return ss.str();
}

void AddLayerStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
                       const Layer& layer,
                       ProfilingGuid networkGuid)
{
    // Add layer to the post-optimisation network structure
    std::string layerName = layer.GetNameStr().empty() ? "<Unnamed>" : layer.GetNameStr();
    timelineUtils->CreateNamedTypedChildEntity(layer.GetGuid(),
                                               networkGuid,
                                               layerName,
                                               LabelsAndEventClasses::LAYER_GUID);
    for (auto&& input : layer.GetInputSlots())
    {
        const IOutputSlot* source = input.GetConnectedOutputSlot();
        ARMNN_ASSERT(source != nullptr);
        timelineUtils->CreateConnectionRelationship(ProfilingRelationshipType::RetentionLink,
                                                    source->GetOwningLayerGuid(),
                                                    layer.GetGuid());
    }
}

void AddWorkloadStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
                          std::unique_ptr<IWorkload>& workload,
                          const Layer& layer)
{
    // Add workload to the post-optimisation network structure
    timelineUtils->CreateTypedEntity(workload->GetGuid(), LabelsAndEventClasses::WORKLOAD_GUID);
    timelineUtils->MarkEntityWithLabel(workload->GetGuid(),
                                       layer.GetBackendId().Get(),
                                       LabelsAndEventClasses::BACKENDID_GUID);

    // Link the workload to the layer
    timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
                                      layer.GetGuid(),
                                      workload->GetGuid(),
                                      LabelsAndEventClasses::CHILD_GUID);
}

} // anonymous namespace

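// Factory wrapper around the LoadedNetwork constructor: any armnn::Exception or std::runtime_error
// thrown while preparing the workloads is logged, reported through errorMessage and turned into a
// null return value.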
std::unique_ptr<LoadedNetwork> LoadedNetwork::MakeLoadedNetwork(std::unique_ptr<OptimizedNetwork> net,
                                                                std::string& errorMessage,
                                                                const INetworkProperties& networkProperties,
                                                                profiling::ProfilingService&  profilingService)
{
    std::unique_ptr<LoadedNetwork> loadedNetwork;

    auto Fail = [&](const std::exception& error) -> std::unique_ptr<LoadedNetwork>
    {
        errorMessage = ToErrorMessage("An error occurred when preparing the network workloads: ", error);
        ARMNN_LOG(error) << errorMessage;

        return std::unique_ptr<LoadedNetwork>();
    };

    try
    {
        loadedNetwork.reset(new LoadedNetwork(std::move(net), networkProperties, profilingService));
    }
    catch (const armnn::RuntimeException& error)
    {
        return Fail(error);
    }
    catch (const armnn::Exception& error)
    {
        return Fail(error);
    }
    catch (const std::runtime_error& error)
    {
        return Fail(error);
    }

    return loadedNetwork;
}

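// The constructor prepares the network for execution in three passes over the topologically sorted
// graph: create the backends and workload factories, then the tensor handles, then the per-layer
// workloads. Finally dynamic buffers are allocated and PostAllocationConfigure() is called on each workload.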
LoadedNetwork::LoadedNetwork(std::unique_ptr<OptimizedNetwork> net,
                             const INetworkProperties& networkProperties,
                             profiling::ProfilingService&  profilingService) :
                             m_OptimizedNetwork(std::move(net)),
                             m_IsImportEnabled(networkProperties.m_ImportEnabled),
                             m_IsExportEnabled(networkProperties.m_ExportEnabled),
                             m_TensorHandleFactoryRegistry(),
                             m_ProfilingService(profilingService)
{
    // Create a profiler and register it for the current thread.
    m_Profiler = std::make_shared<Profiler>();
    ProfilerManager::GetInstance().RegisterProfiler(m_Profiler.get());

    Graph& order = m_OptimizedNetwork->GetGraph().TopologicalSort();
    // First create the tensor handlers, backends and workload factories.
    // Handlers are created before workloads because workload creation can modify
    // some of the handlers (for example, for the splitter and concat layers).
    for (auto&& layer : order)
    {
        auto const& backendId = layer->GetBackendId();
        if (m_Backends.count(backendId) == 0)
        {
            auto createBackend = BackendRegistryInstance().GetFactory(backendId);
            auto it = m_Backends.emplace(std::make_pair(backendId, createBackend()));

            IBackendInternal* backend = it.first->second.get();

            if (backend->SupportsTensorAllocatorAPI())
            {
                auto workloadFactory = backend->CreateWorkloadFactory(
                    m_TensorHandleFactoryRegistry, m_OptimizedNetwork->GetModelOptions());
                m_WorkloadFactories.emplace(
                    std::make_pair(backendId, std::make_pair(std::move(workloadFactory), nullptr)));
            }
            else
            {
                IBackendInternal::IMemoryManagerSharedPtr memoryManager = backend->CreateMemoryManager();
                auto workloadFactory = backend->CreateWorkloadFactory(
                    memoryManager, m_OptimizedNetwork->GetModelOptions());

                m_WorkloadFactories.emplace(
                    std::make_pair(backendId, std::make_pair(std::move(workloadFactory), memoryManager)));
            }
        }
    }

    for (auto&& layer : order)
    {
        auto& workloadFactory = GetWorkloadFactory(*layer);

        switch (layer->GetType())
        {
        case LayerType::Input:
        case LayerType::MemImport:
            {
                // If IsImportEnabled is true then we need to set IsMemoryManaged to false when creating TensorHandles
                layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, !m_IsImportEnabled);
                break;
            }
        default:
            {
                // Look for the layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer
                // If Export is enabled disable memory management so we can export, otherwise we do a copy
                if ((layer->GetNumOutputSlots() == 1) &&
                    (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
                    (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
                {
                    layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, !m_IsExportEnabled);
                }
                else
                {
                    layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
                }
            }
        }
    }

    ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
                        TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
    if (timelineUtils)
    {
        timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);
        // Mark the network with a start of life event
        timelineUtils->RecordEvent(networkGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
        // and with the process ID
        int processID = armnnUtils::Processes::GetCurrentId();
        std::stringstream ss;
        ss << processID;
        timelineUtils->MarkEntityWithLabel(networkGuid, ss.str(), LabelsAndEventClasses::PROCESS_ID_GUID);
    }

    // Then create workloads.
    for (auto&& layer : order)
    {
        if (timelineUtils)
        {
            // Add layer to the post-optimisation network structure
            AddLayerStructure(timelineUtils, *layer, networkGuid);
        }

        const IWorkloadFactory& workloadFactory = GetWorkloadFactory(*layer);

        switch (layer->GetType())
        {
        case LayerType::Input:
        case LayerType::Output:
            {
                // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
                break;
            }
        default:
            {
                auto workload = layer->CreateWorkload(workloadFactory);

                if (!workload)
                {
                    const char* const layerName =
                        layer->GetNameStr().length() != 0 ? layer->GetName() : "<Unnamed>";
                    throw InvalidArgumentException(
                        fmt::format("No workload created for layer (name: '{0}' type: '{1}') (compute '{2}')",
                                    layerName, static_cast<int>(layer->GetType()), layer->GetBackendId().Get()
                    ));
                }

                if (timelineUtils)
                {
                    // Add workload to the post-optimisation network structure
                    AddWorkloadStructure(timelineUtils, workload, *layer);
                }

                m_WorkloadQueue.push_back(move(workload));
                // Release the constant data in the layer.
                layer->ReleaseConstantData();
                break;
            }
        }
    }

    for (auto&& workloadFactory : m_WorkloadFactories)
    {
        workloadFactory.second.first->AfterWorkloadsCreated();
    }

    if (timelineUtils)
    {
        // Commit to send the post-optimisation network structure
        timelineUtils->Commit();
    }

    // Set up memory.
    m_OptimizedNetwork->GetGraph().AllocateDynamicBuffers();

    // Now that the intermediate tensor memory has been set-up, do any post allocation configuration for each workload.
    for (auto& workload : m_WorkloadQueue)
    {
        workload->PostAllocationConfigure();
    }
}

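// Sends the post-optimisation network structure (layers, workloads and their relationships) to the
// profiling timeline and commits it.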
void LoadedNetwork::SendNetworkStructure()
{
    Graph& order = m_OptimizedNetwork->GetGraph().TopologicalSort();
    ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();

    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
                        TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);

    timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);

    for (auto&& layer : order)
    {
        // Add layer to the post-optimisation network structure
        AddLayerStructure(timelineUtils, *layer, networkGuid);
        switch (layer->GetType())
        {
        case LayerType::Input:
        case LayerType::Output:
            {
                // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
                break;
            }
        default:
            {
                for (auto& workload : m_WorkloadQueue)
                {
                    // Add workload to the post-optimisation network structure
                    AddWorkloadStructure(timelineUtils, workload, *layer);
                }
                break;
            }
        }
    }
    // Commit to send the post-optimisation network structure
    timelineUtils->Commit();
}

profiling::ProfilingGuid LoadedNetwork::GetNetworkGuid()
{
    return m_OptimizedNetwork->GetGuid();
}

TensorInfo LoadedNetwork::GetInputTensorInfo(LayerBindingId layerId) const
{
    for (auto&& inputLayer : m_OptimizedNetwork->GetGraph().GetInputLayers())
    {
        ARMNN_ASSERT_MSG(inputLayer->GetNumOutputSlots() == 1, "Input layer should have exactly 1 output slot");
        if (inputLayer->GetBindingId() == layerId)
        {
            return inputLayer->GetOutputSlot(0).GetTensorInfo();
        }
    }

    throw InvalidArgumentException(fmt::format("No input layer is associated with id {}", layerId));
}

TensorInfo LoadedNetwork::GetOutputTensorInfo(LayerBindingId layerId) const
{
    for (auto&& outputLayer : m_OptimizedNetwork->GetGraph().GetOutputLayers())
    {
        ARMNN_ASSERT_MSG(outputLayer->GetNumInputSlots() == 1, "Output layer should have exactly 1 input slot");
        ARMNN_ASSERT_MSG(outputLayer->GetInputSlot(0).GetConnection(), "Input slot on Output layer must be connected");
        if (outputLayer->GetBindingId() == layerId)
        {
            return outputLayer->GetInputSlot(0).GetConnection()->GetTensorInfo();
        }
    }

    throw InvalidArgumentException(fmt::format("No output layer is associated with id {}", layerId));
}

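// Looks up the workload factory registered for the layer's backend. Throws a RuntimeException if no
// factory exists for that backend, and asserts that the factory supports the layer.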
const IWorkloadFactory& LoadedNetwork::GetWorkloadFactory(const Layer& layer) const
{
    const IWorkloadFactory* workloadFactory = nullptr;

    auto it = m_WorkloadFactories.find(layer.GetBackendId());
    if (it == m_WorkloadFactories.end())
    {
        throw RuntimeException(fmt::format("No workload factory for {0} to be used for layer: {1}",
                                           layer.GetBackendId().Get(),
                                           layer.GetNameStr()),
                                           CHECK_LOCATION());
    }

    workloadFactory = it->second.first.get();

    ARMNN_ASSERT_MSG(workloadFactory, "No workload factory");

    std::string reasonIfUnsupported;
    ARMNN_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(layer,
                                                        {},
                                                        reasonIfUnsupported,
                                                        m_OptimizedNetwork->GetModelOptions()),
                     "Factory does not support layer");
    IgnoreUnused(reasonIfUnsupported);
    return *workloadFactory;
}

namespace {

// Non-copyable class owning accelerator-specific tensor data.
class TensorPin
{
public:
    TensorPin(std::unique_ptr<ITensorHandle> handle, const TensorInfo& info, LayerBindingId id)
        : m_TensorHandle(std::move(handle))
        , m_TensorInfo(info)
        , m_Id(id)
    {
    }

    ITensorHandle* GetTensorHandle() const { return m_TensorHandle.get(); }
    const TensorInfo& GetTensorInfo() const { return m_TensorInfo; }
    LayerBindingId GetBindingId() const { return m_Id; }

private:
    std::unique_ptr<ITensorHandle> m_TensorHandle;
    TensorInfo m_TensorInfo;
    LayerBindingId m_Id;
};

static const TensorPin& GetTensorPin(LayerBindingId id,
                                     const std::vector<TensorPin>& pins,
                                     char const* bindingPointDesc)
{
    auto it = std::find_if(pins.begin(), pins.end(),
        [id](const TensorPin& pin)
        {
            return pin.GetBindingId() == id;
        });

    if (it != pins.end())
    {
        return *it;
    }
    else
    {
        throw InvalidArgumentException(fmt::format("No tensor supplied for {0} {1}", bindingPointDesc, id));
    }
}

// Stores data that needs to be kept accessible for the entire execution of a workload.
class WorkloadData
{
public:
    WorkloadData(const InputTensors& inputTensors, const OutputTensors& outputTensors)
    {
        m_InputTensorPins.reserve(inputTensors.size());
        m_OutputTensorPins.reserve(outputTensors.size());

        for (auto inputTensorPair : inputTensors)
        {
            auto inputTensor = inputTensorPair.second;

            std::unique_ptr<ITensorHandle> tensorHandle =
                std::make_unique<ConstPassthroughCpuTensorHandle>(inputTensor.GetInfo(), inputTensor.GetMemoryArea());
            LayerBindingId layerId = inputTensorPair.first;

            m_InputTensorPins.emplace_back(std::move(tensorHandle), inputTensor.GetInfo(), layerId);
        }

        for (auto outputTensorPair : outputTensors)
        {
            auto outputTensor = outputTensorPair.second;

            std::unique_ptr<ITensorHandle> tensorHandle =
                std::make_unique<PassthroughCpuTensorHandle>(outputTensor.GetInfo(), outputTensor.GetMemoryArea());
            LayerBindingId layerId = outputTensorPair.first;

            m_OutputTensorPins.emplace_back(std::move(tensorHandle), outputTensor.GetInfo(), layerId);
        }
    }

    const TensorPin& GetInputTensorPin(LayerBindingId id) const
    {
        return GetTensorPin(id, m_InputTensorPins, "input");
    }

    const TensorPin& GetOutputTensorPin(LayerBindingId id) const
    {
        return GetTensorPin(id, m_OutputTensorPins, "output");
    }

private:

    std::vector<TensorPin> m_InputTensorPins;
    std::vector<TensorPin> m_OutputTensorPins;
};

} // anonymous namespace

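// Runs a single inference: binds the user-supplied input and output tensors to the graph, records
// the inference on the profiling timeline when enabled, and executes the input, workload and output
// queues in order.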
Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
                                      const OutputTensors& outputTensors)
{
    const Graph& graph = m_OptimizedNetwork->GetGraph();

    // Walk graph to determine the order of execution.
    if (graph.GetNumLayers() < 2)
    {
        ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph";
        return Status::Failure;
    }

    // Data that must be kept alive for the entire execution of the workload.
    WorkloadData workloadData(inputTensors, outputTensors);

    if (graph.GetNumInputs() != inputTensors.size())
    {
        throw InvalidArgumentException("Number of inputs provided does not match network.");
    }

    // For each input to the network, call EnqueueInput with the data passed by the user.
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
        m_InputQueue.clear();
        m_InputQueue.reserve(graph.GetNumInputs());
        for (const BindableLayer* inputLayer : graph.GetInputLayers())
        {
            const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
            EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
        }
    }

    // For each output of the network, call EnqueueOutput with the data passed by the user.
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
        m_OutputQueue.clear();
        m_OutputQueue.reserve(graph.GetNumOutputs());
        for (const BindableLayer* outputLayer : graph.GetOutputLayers())
        {
            const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
            EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
        }
    }

    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
                        TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
    ProfilingGuid inferenceGuid = m_ProfilingService.GetNextGuid();
    if (timelineUtils)
    {
        // Add inference timeline trace if profiling is enabled.
        ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
        timelineUtils->CreateTypedEntity(inferenceGuid, LabelsAndEventClasses::INFERENCE_GUID);
        timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
                                          networkGuid,
                                          inferenceGuid,
                                          LabelsAndEventClasses::EXECUTION_OF_GUID);
        timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
    }

    bool executionSucceeded = true;

    {
        if (m_ProfilingService.IsProfilingEnabled())
        {
            m_ProfilingService.IncrementCounterValue(armnn::profiling::INFERENCES_RUN);
        }
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Execute");
        ARMNN_SCOPED_HEAP_PROFILING("Executing");
        executionSucceeded = Execute(timelineUtils, inferenceGuid);
    }

    if (timelineUtils)
    {
        // Add end of life of the inference timeline if profiling is enabled.
        timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
        timelineUtils->Commit();
    }
    return executionSucceeded ? Status::Success : Status::Failure;
}

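// Binds one user input tensor to the given Input layer. If import is enabled and the backend handle
// accepts Malloc memory, the buffer is imported directly; otherwise a memcopy workload is queued.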
void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
{
    if (layer.GetType() != LayerType::Input)
    {
        throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer");
    }

    if (tensorHandle == nullptr)
    {
        throw InvalidArgumentException("EnqueueInput: tensorHandle must not be NULL");
    }

    InputQueueDescriptor inputQueueDescriptor;
    WorkloadInfo info;

    inputQueueDescriptor.m_Inputs.push_back(tensorHandle);
    info.m_InputTensorInfos.push_back(tensorInfo);

    ARMNN_ASSERT_MSG(layer.GetNumOutputSlots() == 1, "Can only handle Input Layer with one output");
    const OutputHandler& handler = layer.GetOutputHandler();
    const TensorInfo& outputTensorInfo = handler.GetTensorInfo();
    ITensorHandle* outputTensorHandle = handler.GetData();
    ARMNN_ASSERT_MSG(outputTensorHandle != nullptr,
                     "Data should have been allocated.");
    inputQueueDescriptor.m_Outputs.push_back(outputTensorHandle);
    info.m_OutputTensorInfos.push_back(outputTensorInfo);

    MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags();
    bool needMemCopy = true;
    if (m_IsImportEnabled)  // Try to import the input tensor
    {
        if (CheckFlag(importFlags, MemorySource::Malloc))
        {
            needMemCopy = false;
            // This assumes a CPU Tensor handle
            void* mem = tensorHandle->Map(false);
            if (outputTensorHandle->Import(mem, MemorySource::Malloc))
            {
                tensorHandle->Unmap();
                return; // No need for a workload since the import has been done.
            }
            tensorHandle->Unmap();
            throw MemoryImportException("EnqueueInput: Memory Import failed");
        }
    }
    if (needMemCopy)
    {
        // Create a mem copy workload for input since we did not import
        std::unique_ptr<IWorkload> inputWorkload = std::make_unique<CopyMemGenericWorkload>(inputQueueDescriptor, info);

        ARMNN_ASSERT_MSG(inputWorkload, "No input workload created");

        std::unique_ptr<TimelineUtilityMethods> timelineUtils =
                            TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
        if (timelineUtils)
        {
            // Add Input Workload to the post-optimisation network structure
            AddWorkloadStructure(timelineUtils, inputWorkload, layer);
            timelineUtils->Commit();
        }

        m_InputQueue.push_back(move(inputWorkload));
    }
}

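// Binds one user output tensor to the given Output layer. If export is enabled and the conditions
// listed below hold, the buffer is exported and only a sync workload is queued; otherwise a memcopy
// workload is queued.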
void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
{
    if (layer.GetType() != LayerType::Output)
    {
        throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer");
    }

    if (tensorHandle == nullptr)
    {
        throw InvalidArgumentException("EnqueueOutput: tensorHandle must not be NULL");
    }

    OutputQueueDescriptor outputQueueDescriptor;
    WorkloadInfo info;

    outputQueueDescriptor.m_Outputs.push_back(tensorHandle);
    info.m_OutputTensorInfos.push_back(tensorInfo);

    ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input.");

    // Gets the output handler from the previous node.
    const OutputHandler& outputHandler = layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler();

    const TensorInfo& inputTensorInfo = outputHandler.GetTensorInfo();
    ITensorHandle* inputTensorHandle = outputHandler.GetData();
    ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");

    // Try to import the output tensor.
    // Note: We can only import the output pointer if all of the following hold true:
    // a) The imported pointer is aligned sufficiently
    // b) The tensor has zero padding
    // c) There is only one connection to the OutputSlot and it is to an OutputLayer.
    // d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
    // e) m_IsExportEnabled must be set to true
    bool needMemCopy = true;
    if (m_IsExportEnabled && (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
    {
        if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
        {
            MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
            if (CheckFlag(importFlags, MemorySource::Malloc))
            {
                needMemCopy = false;
                void *mem = tensorHandle->Map(false);
                bool importOk = inputTensorHandle->Import(mem, MemorySource::Malloc);
                tensorHandle->Unmap();

                if (importOk)
                {
                    // Insert synchronization workload
                    MemSyncQueueDescriptor syncDesc;
                    syncDesc.m_Inputs.push_back(inputTensorHandle);
                    info.m_InputTensorInfos.push_back(inputTensorInfo);
                    auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
                    ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
                    m_OutputQueue.push_back(move(syncWorkload));
                }
                else
                {
                    throw MemoryExportException("EnqueueOutput: Memory Export failed");
                }
            }
        }
    }
    if (needMemCopy)
    {
        // If we got here then we didn't export the memory, so add an output workload which performs a memcopy.
        outputQueueDescriptor.m_Inputs.push_back(inputTensorHandle);
        info.m_InputTensorInfos.push_back(inputTensorInfo);

        std::unique_ptr<IWorkload> outputWorkload =
            std::make_unique<CopyMemGenericWorkload>(outputQueueDescriptor, info);
        ARMNN_ASSERT_MSG(outputWorkload, "No output workload created");

        std::unique_ptr<TimelineUtilityMethods> timelineUtils =
            TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
        if (timelineUtils)
        {
            // Add Output Workload to the post-optimisation network structure
            AddWorkloadStructure(timelineUtils, outputWorkload, layer);
            timelineUtils->Commit();
        }

        m_OutputQueue.push_back(move(outputWorkload));
    }
}

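// Acquires working memory from every backend memory manager and from the tensor handle factory
// registry. Must be called with m_WorkingMemMutex held; the lock_guard parameter enforces this.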
void LoadedNetwork::AllocateWorkingMemory(std::lock_guard<std::mutex>& lock)
{
    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Working Memory Allocation");

    // This unused parameter makes sure this function can only be called while a valid lock is held.
    IgnoreUnused(lock);

    if (m_IsWorkingMemAllocated)
    {
        return;
    }
    for (auto&& workloadFactory : m_WorkloadFactories)
    {
        IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
        if (memoryManager)
        {
            memoryManager->Acquire();
        }
    }
    m_TensorHandleFactoryRegistry.AquireMemory();
    m_IsWorkingMemAllocated = true;
}

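// Releases any working memory previously acquired by AllocateWorkingMemory().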
void LoadedNetwork::FreeWorkingMemory()
{
    std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
    if (!m_IsWorkingMemAllocated)
    {
        return;
    }
    // Inform each memory manager to release memory in its respective memory group
    for (auto&& workloadFactory : m_WorkloadFactories)
    {
        IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
        if (memoryManager)
        {
            memoryManager->Release();
        }
    }
    m_TensorHandleFactoryRegistry.ReleaseMemory();
    m_IsWorkingMemAllocated = false;
}

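// Executes the input, workload and output queues in order, allocating working memory first and
// recording per-workload timeline events when profiling is enabled. Returns false if any workload
// throws a RuntimeException or std::runtime_error.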
bool LoadedNetwork::Execute(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
                            profiling::ProfilingGuid inferenceGuid)
{
    bool success = true;

    auto Fail = [&](const std::exception& error)
    {
        ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
        success = false;
    };

    try
    {
        std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
        AllocateWorkingMemory(lockGuard);

        ProfilingDynamicGuid workloadInferenceID(0);
        auto ExecuteQueue = [&timelineUtils, &workloadInferenceID, &inferenceGuid](WorkloadQueue& queue)
        {
            for (auto& workload : queue)
            {
                if (timelineUtils)
                {
                    workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
                                                                                                    inferenceGuid);
                }
                workload->Execute();
                if (timelineUtils)
                {
                    timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
                }
            }
        };

        ExecuteQueue(m_InputQueue);
        ExecuteQueue(m_WorkloadQueue);
        ExecuteQueue(m_OutputQueue);
    }
    catch (const RuntimeException& error)
    {
        Fail(error);
    }
    catch (const std::runtime_error& error)
    {
        Fail(error);
    }

    return success;
}

void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
{
    for (auto&& workloadPtr : m_WorkloadQueue)
    {
        workloadPtr->RegisterDebugCallback(func);
    }
}

} // namespace armnn