LoadedNetwork::LoadedNetwork(std::unique_ptr<OptimizedNetwork> net)
: m_CpuRef()
, m_OptimizedNetwork(std::move(net))
+ , m_WorkingMemLock(m_WorkingMemMutex, std::defer_lock)
{
// Create a profiler and register it for the current thread.
m_Profiler = std::make_shared<Profiler>();
}
// For each input to the network, call EnqueueInput with the data passed by the user.
+ m_InputQueue.clear();
+ m_InputQueue.reserve(graph.GetNumInputs());
for (const BindableLayer* inputLayer : graph.GetInputLayers())
{
const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
}
// For each output to the network, call EnqueueOutput with the data passed by the user.
+ m_OutputQueue.clear();
+ m_OutputQueue.reserve(graph.GetNumOutputs());
for (const BindableLayer* outputLayer : graph.GetOutputLayers())
{
const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
executionSucceeded = Execute();
}
- // Hack: get rid of inputs and outputs we added.
- TidyWorkloadQueue(graph.GetNumInputs(), graph.GetNumOutputs());
-
return executionSucceeded ? Status::Success : Status::Failure;
}
const IWorkloadFactory& workloadFactory = GetWorkloadFactory(layer);
auto inputWorkload = workloadFactory.CreateInput(inputQueueDescriptor, info);
BOOST_ASSERT_MSG(inputWorkload, "No input workload created");
- m_WorkloadQueue.insert(m_WorkloadQueue.begin(), move(inputWorkload));
+ m_InputQueue.push_back(move(inputWorkload));
}
void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
const IWorkloadFactory& workloadFactory = GetWorkloadFactory(layer);
auto outputWorkload = workloadFactory.CreateOutput(outputQueueDescriptor, info);
BOOST_ASSERT_MSG(outputWorkload, "No output workload created");
- m_WorkloadQueue.push_back(move(outputWorkload));
+ m_OutputQueue.push_back(move(outputWorkload));
}
-bool LoadedNetwork::Execute()
+void LoadedNetwork::AllocateWorkingMemory()
{
- bool success = true;
-
+ BOOST_ASSERT_MSG(m_WorkingMemLock.owns_lock(), "Cannot allocate working memory if mutex is not already locked.");
+ if (m_IsWorkingMemAllocated)
+ {
+ return;
+ }
m_CpuRef.Acquire();
m_CpuAcc.Acquire();
m_GpuAcc.Acquire();
+ m_IsWorkingMemAllocated = true;
+}
+
+void LoadedNetwork::FreeWorkingMemory()
+{
+    // Releases the backends' working memory, if allocated. Called from the
+    // destructor and from Runtime::EnqueueWorkload when switching networks,
+    // so it can race with Execute() on another thread.
+    //
+    // Lock the mutex itself, not the shared m_WorkingMemLock member:
+    // std::unique_lock is not thread-safe, so guarding the same unique_lock
+    // object from two threads is a data race, and unique_lock::lock() throws
+    // std::system_error(resource_deadlock_would_occur) if another thread
+    // already holds it via that object. Locking m_WorkingMemMutex directly
+    // gives the same mutual exclusion against Execute() without either hazard.
+    std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
+    if (!m_IsWorkingMemAllocated)
+    {
+        return;
+    }
+    // Informs the memory managers to release memory in its respective memory group
+    m_CpuRef.Release();
+    m_CpuAcc.Release();
+    m_GpuAcc.Release();
+    m_IsWorkingMemAllocated = false;
+}
+
+bool LoadedNetwork::Execute()
+{
+ bool success = true;
auto Fail = [&](const std::exception& error)
{
try
{
- for (size_t i = 0; i < m_WorkloadQueue.size(); ++i)
+ std::lock_guard<UniqueMutexLock> lockGuard(m_WorkingMemLock);
+ AllocateWorkingMemory();
+
+ for (auto& input : m_InputQueue)
+ {
+ input->Execute();
+ }
+
+ for (auto& workload : m_WorkloadQueue)
+ {
+ workload->Execute();
+ }
+
+ for (auto& output: m_OutputQueue)
{
- m_WorkloadQueue[i]->Execute();
+ output->Execute();
}
}
catch (const RuntimeException& error)
Fail(error);
}
- // Informs the memory managers to release memory in it's respective memory group
- m_CpuRef.Release();
- m_CpuAcc.Release();
- m_GpuAcc.Release();
-
return success;
}
-void LoadedNetwork::TidyWorkloadQueue(size_t numInputs, size_t numOutputs)
-{
- m_WorkloadQueue.erase(m_WorkloadQueue.begin(), m_WorkloadQueue.begin() + boost::numeric_cast<long>(numInputs));
- m_WorkloadQueue.erase(m_WorkloadQueue.end() - boost::numeric_cast<long>(numOutputs), m_WorkloadQueue.end());
-}
-
}
#include <backends/Workload.hpp>
#include <backends/WorkloadFactory.hpp>
+#include <mutex>
+
namespace cl
{
class Context;
class LoadedNetwork
{
public:
+ using WorkloadQueue = std::vector< std::unique_ptr<IWorkload> >;
+ ~LoadedNetwork(){ FreeWorkingMemory(); }
+
TensorInfo GetInputTensorInfo(LayerBindingId layerId) const;
TensorInfo GetOutputTensorInfo(LayerBindingId layerId) const;
// the shared_ptr's reference counter
const std::shared_ptr<Profiler>& GetProfiler() const { return m_Profiler; }
+ void AllocateWorkingMemory();
+ void FreeWorkingMemory();
+
private:
LoadedNetwork(std::unique_ptr<OptimizedNetwork> net);
bool Execute();
- void TidyWorkloadQueue(size_t numInputs, size_t numOutputs);
-
const IWorkloadFactory& GetWorkloadFactory(const Layer& layer) const;
RefWorkloadFactory m_CpuRef;
ClWorkloadFactory m_GpuAcc;
std::unique_ptr<OptimizedNetwork> m_OptimizedNetwork;
- std::vector< std::unique_ptr<IWorkload> > m_WorkloadQueue;
+ WorkloadQueue m_InputQueue;
+ WorkloadQueue m_WorkloadQueue;
+ WorkloadQueue m_OutputQueue;
std::shared_ptr<Profiler> m_Profiler;
+
+ using UniqueMutexLock = std::unique_lock<std::mutex>;
+ mutable std::mutex m_WorkingMemMutex;
+ UniqueMutexLock m_WorkingMemLock;
+
+ bool m_IsWorkingMemAllocated=false;
};
}
return GetLoadedNetworkPtr(networkId)->GetOutputTensorInfo(layerId);
}
+
Status Runtime::EnqueueWorkload(NetworkId networkId,
const InputTensors& inputTensors,
const OutputTensors& outputTensors)
{
LoadedNetwork* loadedNetwork = GetLoadedNetworkPtr(networkId);
+
+ static thread_local NetworkId lastId = networkId;
+ if (lastId != networkId)
+ {
+ LoadedNetworkFuncSafe(lastId, [](LoadedNetwork* network)
+ {
+ network->FreeWorkingMemory();
+ });
+ }
+ lastId=networkId;
+
return loadedNetwork->EnqueueWorkload(inputTensors, outputTensors);
}