Update ACL pin to b309fc249e4383b4d40ae03e377c3cbad3f9f5f7
src/armnn/LoadedNetwork.cpp (platform/upstream/armnn.git)
//
// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "LoadedNetwork.hpp"
#include "Layer.hpp"
#include "Graph.hpp"
#include "Network.hpp"
#include <Processes.hpp>
#include "Profiling.hpp"
#include "HeapProfiling.hpp"

#include <armnn/BackendRegistry.hpp>
#include <armnn/Logging.hpp>
#include <armnn/utility/Assert.hpp>

#include <backendsCommon/CpuTensorHandle.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <backendsCommon/MemCopyWorkload.hpp>
#include <backendsCommon/MemSyncWorkload.hpp>

#include <LabelsAndEventClasses.hpp>

#include <fmt/format.h>

namespace armnn
{

using namespace std;
using namespace armnn::profiling;

namespace
{

template <typename ExceptionType>
std::string ToErrorMessage(const char * prefix, const ExceptionType & error)
{
    std::stringstream ss;
    ss << prefix << " " << error.what();
    return ss.str();
}

void AddLayerStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
                       const Layer& layer,
                       ProfilingGuid networkGuid)
{
    // Add layer to the post-optimisation network structure
    std::string layerName = layer.GetNameStr().empty() ? "<Unnamed>" : layer.GetNameStr();
    timelineUtils->CreateNamedTypedChildEntity(layer.GetGuid(),
                                               networkGuid,
                                               layerName,
                                               LabelsAndEventClasses::LAYER_GUID);
    for (auto&& input : layer.GetInputSlots())
    {
        const IOutputSlot* source = input.GetConnectedOutputSlot();
        ARMNN_ASSERT(source != nullptr);
        timelineUtils->CreateConnectionRelationship(ProfilingRelationshipType::RetentionLink,
                                                    source->GetOwningLayerGuid(),
                                                    layer.GetGuid());
    }
}

void AddWorkloadStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
                          std::unique_ptr<IWorkload>& workload,
                          const Layer& layer)
{
    // Add workload to the post-optimisation network structure
    timelineUtils->CreateTypedEntity(workload->GetGuid(), LabelsAndEventClasses::WORKLOAD_GUID);
    timelineUtils->MarkEntityWithLabel(workload->GetGuid(),
                                       layer.GetBackendId().Get(),
                                       LabelsAndEventClasses::BACKENDID_GUID);

    // Link the workload to the layer
    timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
                                      layer.GetGuid(),
                                      workload->GetGuid(),
                                      LabelsAndEventClasses::CHILD_GUID);
}

} // anonymous namespace

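// Factory wrapper around the LoadedNetwork constructor: any armnn::Exception or std::runtime_error
// thrown while preparing the workloads is logged, reported through errorMessage and turned into a
// null return value.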
std::unique_ptr<LoadedNetwork> LoadedNetwork::MakeLoadedNetwork(std::unique_ptr<OptimizedNetwork> net,
                                                                std::string& errorMessage,
                                                                const INetworkProperties& networkProperties,
                                                                profiling::ProfilingService&  profilingService)
{
    std::unique_ptr<LoadedNetwork> loadedNetwork;

    auto Fail = [&](const std::exception& error) -> std::unique_ptr<LoadedNetwork>
    {
        errorMessage = ToErrorMessage("An error occurred when preparing the network workloads: ", error);
        ARMNN_LOG(error) << errorMessage;

        return std::unique_ptr<LoadedNetwork>();
    };

    try
    {
        loadedNetwork.reset(new LoadedNetwork(std::move(net), networkProperties, profilingService));
    }
    catch (const armnn::RuntimeException& error)
    {
        return Fail(error);
    }
    catch (const armnn::Exception& error)
    {
        return Fail(error);
    }
    catch (const std::runtime_error& error)
    {
        return Fail(error);
    }

    return loadedNetwork;
}

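// The constructor prepares the network for execution in three passes over the topologically sorted
// graph: create the backends and workload factories, then the tensor handles, then the per-layer
// workloads. Finally dynamic buffers are allocated and PostAllocationConfigure() is called on each workload.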
LoadedNetwork::LoadedNetwork(std::unique_ptr<OptimizedNetwork> net,
                             const INetworkProperties& networkProperties,
                             profiling::ProfilingService&  profilingService) :
                             m_OptimizedNetwork(std::move(net)),
                             m_IsImportEnabled(networkProperties.m_ImportEnabled),
                             m_IsExportEnabled(networkProperties.m_ExportEnabled),
                             m_TensorHandleFactoryRegistry(),
                             m_ProfilingService(profilingService)
{
    // Create a profiler and register it for the current thread.
    m_Profiler = std::make_shared<Profiler>();
    ProfilerManager::GetInstance().RegisterProfiler(m_Profiler.get());

    Graph& order = m_OptimizedNetwork->GetGraph().TopologicalSort();
    // First create the tensor handlers, backends and workload factories.
    // Handlers are created before workloads because workload creation can modify
    // some of the handlers (for example, for the splitter and concat layers).
    for (auto&& layer : order)
    {
        auto const& backendId = layer->GetBackendId();
        if (m_Backends.count(backendId) == 0)
        {
            auto createBackend = BackendRegistryInstance().GetFactory(backendId);
            auto it = m_Backends.emplace(std::make_pair(backendId, createBackend()));

            IBackendInternal* backend = it.first->second.get();

            if (backend->SupportsTensorAllocatorAPI())
            {
                auto workloadFactory = backend->CreateWorkloadFactory(
                    m_TensorHandleFactoryRegistry, m_OptimizedNetwork->GetModelOptions());
                m_WorkloadFactories.emplace(
                    std::make_pair(backendId, std::make_pair(std::move(workloadFactory), nullptr)));
            }
            else
            {
                IBackendInternal::IMemoryManagerSharedPtr memoryManager = backend->CreateMemoryManager();
                auto workloadFactory = backend->CreateWorkloadFactory(
                    memoryManager, m_OptimizedNetwork->GetModelOptions());

                m_WorkloadFactories.emplace(
                    std::make_pair(backendId, std::make_pair(std::move(workloadFactory), memoryManager)));
            }
        }
    }

    for (auto&& layer : order)
    {
        auto& workloadFactory = GetWorkloadFactory(*layer);

        switch (layer->GetType())
        {
        case LayerType::Input:
        case LayerType::MemImport:
            {
                // If IsImportEnabled is true then we need to set IsMemoryManaged to false when creating TensorHandles
                layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, !m_IsImportEnabled);
                break;
            }
        default:
            {
                // Look for the layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer
                // If Export is enabled disable memory management so we can export, otherwise we do a copy
                if ((layer->GetNumOutputSlots() == 1) &&
                    (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
                    (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
                {
                    layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, !m_IsExportEnabled);
                }
                else
                {
                    layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
                }
            }
        }
    }

    ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
                        TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
    if (timelineUtils)
    {
        timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);
        // Mark the network with a start of life event
        timelineUtils->RecordEvent(networkGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
        // and with the process ID
        int processID = armnnUtils::Processes::GetCurrentId();
        std::stringstream ss;
        ss << processID;
        timelineUtils->MarkEntityWithLabel(networkGuid, ss.str(), LabelsAndEventClasses::PROCESS_ID_GUID);
    }

    // Then create workloads.
    for (auto&& layer : order)
    {
        if (timelineUtils)
        {
            // Add layer to the post-optimisation network structure
            AddLayerStructure(timelineUtils, *layer, networkGuid);
        }

        const IWorkloadFactory& workloadFactory = GetWorkloadFactory(*layer);

        switch (layer->GetType())
        {
        case LayerType::Input:
        case LayerType::Output:
            {
                // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
                break;
            }
        default:
            {
                auto workload = layer->CreateWorkload(workloadFactory);

                if (!workload)
                {
                    const char* const layerName =
                        layer->GetNameStr().length() != 0 ? layer->GetName() : "<Unnamed>";
                    throw InvalidArgumentException(
                        fmt::format("No workload created for layer (name: '{0}' type: '{1}') (compute '{2}')",
                                    layerName, static_cast<int>(layer->GetType()), layer->GetBackendId().Get()
                    ));
                }

                if (timelineUtils)
                {
                    // Add workload to the post-optimisation network structure
                    AddWorkloadStructure(timelineUtils, workload, *layer);
                }

                m_WorkloadQueue.push_back(move(workload));
                // Release the constant data in the layer.
                layer->ReleaseConstantData();
                break;
            }
        }
    }

    for (auto&& workloadFactory : m_WorkloadFactories)
    {
        workloadFactory.second.first->AfterWorkloadsCreated();
    }

    if (timelineUtils)
    {
        // Commit to send the post-optimisation network structure
        timelineUtils->Commit();
    }

    // Set up memory.
    m_OptimizedNetwork->GetGraph().AllocateDynamicBuffers();

    // Now that the intermediate tensor memory has been set-up, do any post allocation configuration for each workload.
    for (auto& workload : m_WorkloadQueue)
    {
        workload->PostAllocationConfigure();
    }
}

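// Sends the post-optimisation network structure (layers, workloads and their relationships) to the
// profiling timeline and commits it.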
void LoadedNetwork::SendNetworkStructure()
{
    Graph& order = m_OptimizedNetwork->GetGraph().TopologicalSort();
    ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();

    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
                        TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);

    timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);

    for (auto&& layer : order)
    {
        // Add layer to the post-optimisation network structure
        AddLayerStructure(timelineUtils, *layer, networkGuid);
        switch (layer->GetType())
        {
        case LayerType::Input:
        case LayerType::Output:
            {
                // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
                break;
            }
        default:
            {
                for (auto& workload : m_WorkloadQueue)
                {
                    // Add workload to the post-optimisation network structure
                    AddWorkloadStructure(timelineUtils, workload, *layer);
                }
                break;
            }
        }
    }
    // Commit to send the post-optimisation network structure
    timelineUtils->Commit();
}

profiling::ProfilingGuid LoadedNetwork::GetNetworkGuid()
{
    return m_OptimizedNetwork->GetGuid();
}

TensorInfo LoadedNetwork::GetInputTensorInfo(LayerBindingId layerId) const
{
    for (auto&& inputLayer : m_OptimizedNetwork->GetGraph().GetInputLayers())
    {
        ARMNN_ASSERT_MSG(inputLayer->GetNumOutputSlots() == 1, "Input layer should have exactly 1 output slot");
        if (inputLayer->GetBindingId() == layerId)
        {
            return inputLayer->GetOutputSlot(0).GetTensorInfo();
        }
    }

    throw InvalidArgumentException(fmt::format("No input layer is associated with id {}", layerId));
}

TensorInfo LoadedNetwork::GetOutputTensorInfo(LayerBindingId layerId) const
{
    for (auto&& outputLayer : m_OptimizedNetwork->GetGraph().GetOutputLayers())
    {
        ARMNN_ASSERT_MSG(outputLayer->GetNumInputSlots() == 1, "Output layer should have exactly 1 input slot");
        ARMNN_ASSERT_MSG(outputLayer->GetInputSlot(0).GetConnection(), "Input slot on Output layer must be connected");
        if (outputLayer->GetBindingId() == layerId)
        {
            return outputLayer->GetInputSlot(0).GetConnection()->GetTensorInfo();
        }
    }

    throw InvalidArgumentException(fmt::format("No output layer is associated with id {}", layerId));
}

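// Looks up the workload factory registered for the layer's backend. Throws a RuntimeException if no
// factory exists for that backend, and asserts that the factory supports the layer.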
const IWorkloadFactory& LoadedNetwork::GetWorkloadFactory(const Layer& layer) const
{
    const IWorkloadFactory* workloadFactory = nullptr;

    auto it = m_WorkloadFactories.find(layer.GetBackendId());
    if (it == m_WorkloadFactories.end())
    {
        throw RuntimeException(fmt::format("No workload factory for {0} to be used for layer: {1}",
                                           layer.GetBackendId().Get(),
                                           layer.GetNameStr()),
                                           CHECK_LOCATION());
    }

    workloadFactory = it->second.first.get();

    ARMNN_ASSERT_MSG(workloadFactory, "No workload factory");

    std::string reasonIfUnsupported;
    ARMNN_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(layer,
                                                        {},
                                                        reasonIfUnsupported,
                                                        m_OptimizedNetwork->GetModelOptions()),
                     "Factory does not support layer");
    IgnoreUnused(reasonIfUnsupported);
    return *workloadFactory;
}

namespace {

// Non-copyable class owning accelerator-specific tensor data.
class TensorPin
{
public:
    TensorPin(std::unique_ptr<ITensorHandle> handle, const TensorInfo& info, LayerBindingId id)
        : m_TensorHandle(std::move(handle))
        , m_TensorInfo(info)
        , m_Id(id)
    {
    }

    ITensorHandle* GetTensorHandle() const { return m_TensorHandle.get(); }
    const TensorInfo& GetTensorInfo() const { return m_TensorInfo; }
    LayerBindingId GetBindingId() const { return m_Id; }

private:
    std::unique_ptr<ITensorHandle> m_TensorHandle;
    TensorInfo m_TensorInfo;
    LayerBindingId m_Id;
};

static const TensorPin& GetTensorPin(LayerBindingId id,
                                     const std::vector<TensorPin>& pins,
                                     char const* bindingPointDesc)
{
    auto it = std::find_if(pins.begin(), pins.end(),
        [id](const TensorPin& pin)
        {
            return pin.GetBindingId() == id;
        });

    if (it != pins.end())
    {
        return *it;
    }
    else
    {
        throw InvalidArgumentException(fmt::format("No tensor supplied for {0} {1}", bindingPointDesc, id));
    }
}

// Stores data that needs to be kept accessible for the entire execution of a workload.
class WorkloadData
{
public:
    WorkloadData(const InputTensors& inputTensors, const OutputTensors& outputTensors)
    {
        m_InputTensorPins.reserve(inputTensors.size());
        m_OutputTensorPins.reserve(outputTensors.size());

        for (auto inputTensorPair : inputTensors)
        {
            auto inputTensor = inputTensorPair.second;

            std::unique_ptr<ITensorHandle> tensorHandle =
                std::make_unique<ConstPassthroughCpuTensorHandle>(inputTensor.GetInfo(), inputTensor.GetMemoryArea());
            LayerBindingId layerId = inputTensorPair.first;

            m_InputTensorPins.emplace_back(std::move(tensorHandle), inputTensor.GetInfo(), layerId);
        }

        for (auto outputTensorPair : outputTensors)
        {
            auto outputTensor = outputTensorPair.second;

            std::unique_ptr<ITensorHandle> tensorHandle =
                std::make_unique<PassthroughCpuTensorHandle>(outputTensor.GetInfo(), outputTensor.GetMemoryArea());
            LayerBindingId layerId = outputTensorPair.first;

            m_OutputTensorPins.emplace_back(std::move(tensorHandle), outputTensor.GetInfo(), layerId);
        }
    }

    const TensorPin& GetInputTensorPin(LayerBindingId id) const
    {
        return GetTensorPin(id, m_InputTensorPins, "input");
    }

    const TensorPin& GetOutputTensorPin(LayerBindingId id) const
    {
        return GetTensorPin(id, m_OutputTensorPins, "output");
    }

private:

    std::vector<TensorPin> m_InputTensorPins;
    std::vector<TensorPin> m_OutputTensorPins;
};

} // anonymous namespace

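// Runs a single inference: binds the user-supplied input and output tensors to the graph, records
// the inference on the profiling timeline when enabled, and executes the input, workload and output
// queues in order.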
Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
                                      const OutputTensors& outputTensors)
{
    const Graph& graph = m_OptimizedNetwork->GetGraph();

    // Walk graph to determine the order of execution.
    if (graph.GetNumLayers() < 2)
    {
        ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph";
        return Status::Failure;
    }

    // Data that must be kept alive for the entire execution of the workload.
    WorkloadData workloadData(inputTensors, outputTensors);

    if (graph.GetNumInputs() != inputTensors.size())
    {
        throw InvalidArgumentException("Number of inputs provided does not match network.");
    }

    // For each input to the network, call EnqueueInput with the data passed by the user.
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
        m_InputQueue.clear();
        m_InputQueue.reserve(graph.GetNumInputs());
        for (const BindableLayer* inputLayer : graph.GetInputLayers())
        {
            const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
            EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
        }
    }

    // For each output of the network, call EnqueueOutput with the data passed by the user.
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
        m_OutputQueue.clear();
        m_OutputQueue.reserve(graph.GetNumOutputs());
        for (const BindableLayer* outputLayer : graph.GetOutputLayers())
        {
            const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
            EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
        }
    }

    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
                        TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
    ProfilingGuid inferenceGuid = m_ProfilingService.GetNextGuid();
    if (timelineUtils)
    {
        // Add inference timeline trace if profiling is enabled.
        ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
        timelineUtils->CreateTypedEntity(inferenceGuid, LabelsAndEventClasses::INFERENCE_GUID);
        timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
                                          networkGuid,
                                          inferenceGuid,
                                          LabelsAndEventClasses::EXECUTION_OF_GUID);
        timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
    }

    bool executionSucceeded = true;

    {
        if (m_ProfilingService.IsProfilingEnabled())
        {
            m_ProfilingService.IncrementCounterValue(armnn::profiling::INFERENCES_RUN);
        }
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Execute");
        ARMNN_SCOPED_HEAP_PROFILING("Executing");
        executionSucceeded = Execute(timelineUtils, inferenceGuid);
    }

    if (timelineUtils)
    {
        // Add end of life of the inference timeline if profiling is enabled.
        timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
        timelineUtils->Commit();
    }
    return executionSucceeded ? Status::Success : Status::Failure;
}

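// Binds one user input tensor to the given Input layer. If import is enabled and the backend handle
// accepts Malloc memory, the buffer is imported directly; otherwise a memcopy workload is queued.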
void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
{
    if (layer.GetType() != LayerType::Input)
    {
        throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer");
    }

    if (tensorHandle == nullptr)
    {
        throw InvalidArgumentException("EnqueueInput: tensorHandle must not be NULL");
    }

    InputQueueDescriptor inputQueueDescriptor;
    WorkloadInfo info;

    inputQueueDescriptor.m_Inputs.push_back(tensorHandle);
    info.m_InputTensorInfos.push_back(tensorInfo);

    ARMNN_ASSERT_MSG(layer.GetNumOutputSlots() == 1, "Can only handle Input Layer with one output");
    const OutputHandler& handler = layer.GetOutputHandler();
    const TensorInfo& outputTensorInfo = handler.GetTensorInfo();
    ITensorHandle* outputTensorHandle = handler.GetData();
    ARMNN_ASSERT_MSG(outputTensorHandle != nullptr,
                     "Data should have been allocated.");
    inputQueueDescriptor.m_Outputs.push_back(outputTensorHandle);
    info.m_OutputTensorInfos.push_back(outputTensorInfo);

    MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags();
    bool needMemCopy = true;
    if (m_IsImportEnabled)  // Try to import the input tensor
    {
        if (CheckFlag(importFlags, MemorySource::Malloc))
        {
            needMemCopy = false;
            // This assumes a CPU Tensor handle
            void* mem = tensorHandle->Map(false);
            if (outputTensorHandle->Import(mem, MemorySource::Malloc))
            {
                tensorHandle->Unmap();
                return; // No need for a workload since the import has been done.
            }
            tensorHandle->Unmap();
            throw MemoryImportException("EnqueueInput: Memory Import failed");
        }
    }
    if (needMemCopy)
    {
        // Create a mem copy workload for input since we did not import
        std::unique_ptr<IWorkload> inputWorkload = std::make_unique<CopyMemGenericWorkload>(inputQueueDescriptor, info);

        ARMNN_ASSERT_MSG(inputWorkload, "No input workload created");

        std::unique_ptr<TimelineUtilityMethods> timelineUtils =
                            TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
        if (timelineUtils)
        {
            // Add Input Workload to the post-optimisation network structure
            AddWorkloadStructure(timelineUtils, inputWorkload, layer);
            timelineUtils->Commit();
        }

        m_InputQueue.push_back(move(inputWorkload));
    }
}

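// Binds one user output tensor to the given Output layer. If export is enabled and the conditions
// listed below hold, the buffer is exported and only a sync workload is queued; otherwise a memcopy
// workload is queued.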
void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
{
    if (layer.GetType() != LayerType::Output)
    {
        throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer");
    }

    if (tensorHandle == nullptr)
    {
        throw InvalidArgumentException("EnqueueOutput: tensorHandle must not be NULL");
    }

    OutputQueueDescriptor outputQueueDescriptor;
    WorkloadInfo info;

    outputQueueDescriptor.m_Outputs.push_back(tensorHandle);
    info.m_OutputTensorInfos.push_back(tensorInfo);

    ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input.");

    // Gets the output handler from the previous node.
    const OutputHandler& outputHandler = layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler();

    const TensorInfo& inputTensorInfo = outputHandler.GetTensorInfo();
    ITensorHandle* inputTensorHandle = outputHandler.GetData();
    ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");

    // Try to import the output tensor.
    // Note: We can only import the output pointer if all of the following hold true:
    // a) The imported pointer is aligned sufficiently
    // b) The tensor has zero padding
    // c) There is only one connection to the OutputSlot and it is to an OutputLayer.
    // d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
    // e) m_IsExportEnabled must be set to true
    bool needMemCopy = true;
    if (m_IsExportEnabled && (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
    {
        if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
        {
            MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
            if (CheckFlag(importFlags, MemorySource::Malloc))
            {
                needMemCopy = false;
                void *mem = tensorHandle->Map(false);
                bool importOk = inputTensorHandle->Import(mem, MemorySource::Malloc);
                tensorHandle->Unmap();

                if (importOk)
                {
                    // Insert synchronization workload
                    MemSyncQueueDescriptor syncDesc;
                    syncDesc.m_Inputs.push_back(inputTensorHandle);
                    info.m_InputTensorInfos.push_back(inputTensorInfo);
                    auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
                    ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
                    m_OutputQueue.push_back(move(syncWorkload));
                }
                else
                {
                    throw MemoryExportException("EnqueueOutput: Memory Export failed");
                }
            }
        }
    }
    if (needMemCopy)
    {
        // If we got here then we didn't export the memory, so add an output workload which performs a memcopy.
        outputQueueDescriptor.m_Inputs.push_back(inputTensorHandle);
        info.m_InputTensorInfos.push_back(inputTensorInfo);

        std::unique_ptr<IWorkload> outputWorkload =
            std::make_unique<CopyMemGenericWorkload>(outputQueueDescriptor, info);
        ARMNN_ASSERT_MSG(outputWorkload, "No output workload created");

        std::unique_ptr<TimelineUtilityMethods> timelineUtils =
            TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
        if (timelineUtils)
        {
            // Add Output Workload to the post-optimisation network structure
            AddWorkloadStructure(timelineUtils, outputWorkload, layer);
            timelineUtils->Commit();
        }

        m_OutputQueue.push_back(move(outputWorkload));
    }
}

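// Acquires working memory from every backend memory manager and from the tensor handle factory
// registry. Must be called with m_WorkingMemMutex held; the lock_guard parameter enforces this.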
void LoadedNetwork::AllocateWorkingMemory(std::lock_guard<std::mutex>& lock)
{
    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Working Memory Allocation");

    // This unused parameter makes sure this function can only be called while a valid lock is held.
    IgnoreUnused(lock);

    if (m_IsWorkingMemAllocated)
    {
        return;
    }
    for (auto&& workloadFactory : m_WorkloadFactories)
    {
        IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
        if (memoryManager)
        {
            memoryManager->Acquire();
        }
    }
    m_TensorHandleFactoryRegistry.AquireMemory();
    m_IsWorkingMemAllocated = true;
}

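// Releases any working memory previously acquired by AllocateWorkingMemory().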
void LoadedNetwork::FreeWorkingMemory()
{
    std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
    if (!m_IsWorkingMemAllocated)
    {
        return;
    }
    // Inform each memory manager to release memory in its respective memory group
    for (auto&& workloadFactory : m_WorkloadFactories)
    {
        IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
        if (memoryManager)
        {
            memoryManager->Release();
        }
    }
    m_TensorHandleFactoryRegistry.ReleaseMemory();
    m_IsWorkingMemAllocated = false;
}

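// Executes the input, workload and output queues in order, allocating working memory first and
// recording per-workload timeline events when profiling is enabled. Returns false if any workload
// throws a RuntimeException or std::runtime_error.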
bool LoadedNetwork::Execute(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
                            profiling::ProfilingGuid inferenceGuid)
{
    bool success = true;

    auto Fail = [&](const std::exception& error)
    {
        ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
        success = false;
    };

    try
    {
        std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
        AllocateWorkingMemory(lockGuard);

        ProfilingDynamicGuid workloadInferenceID(0);
        auto ExecuteQueue = [&timelineUtils, &workloadInferenceID, &inferenceGuid](WorkloadQueue& queue)
        {
            for (auto& workload : queue)
            {
                if (timelineUtils)
                {
                    workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
                                                                                                    inferenceGuid);
                }
                workload->Execute();
                if (timelineUtils)
                {
                    timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
                }
            }
        };

        ExecuteQueue(m_InputQueue);
        ExecuteQueue(m_WorkloadQueue);
        ExecuteQueue(m_OutputQueue);
    }
    catch (const RuntimeException& error)
    {
        Fail(error);
    }
    catch (const std::runtime_error& error)
    {
        Fail(error);
    }

    return success;
}

void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
{
    for (auto&& workloadPtr : m_WorkloadQueue)
    {
        workloadPtr->RegisterDebugCallback(func);
    }
}

} // namespace armnn