// Copyright (C) 2018-2019 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you (End User License Agreement for the Intel(R) Software
// Development Products (Version May 2017)). Unless the License provides
// otherwise, you may not use, modify, copy, publish, distribute, disclose or
// transmit this software or the related documents without Intel's prior
// written permission.
//
// This software and the related documents are provided as is, with no
// express or implied warranties, other than those that are expressly
// stated in the License.

#include "hetero_executable_network.h"
#include "hetero_async_infer_request.h"
#include "ie_util_internal.hpp"
#include "hetero_device_loader.h"

#include <algorithm>
#include <array>
#include <fstream>
#include <map>
#include <memory>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include <ie_plugin_dispatcher.hpp>
#include <ie_graph_splitter.hpp>
#include "fallback_policy.h"
#include "details/caseless.hpp"
#include "ie_plugin_config.hpp"
#include "hetero/hetero_plugin_config.hpp"
#include "precision_utils.h"

using namespace InferenceEngine;
using namespace details;
using namespace HeteroPlugin;
using namespace InferenceEngine::PluginConfigParams;
using namespace InferenceEngine::HeteroConfigParams;

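// Collects the distinct layer affinities of the network in traversal order;
// the result is later passed to splitGraph() to cut the network into per-device subgraphs.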
std::vector<std::string> getAffinities(InferenceEngine::ICNNNetwork &network) {
    std::vector<std::string> ret;
    std::unordered_set<std::string> affinities;
    traverse::traverse(network,
                       [&](const InferenceEngine::CNNLayerPtr &layer) {
                           assert(nullptr != layer);
                           if (!contains(affinities, layer->affinity)) {
                               affinities.insert(layer->affinity);
                               ret.push_back(layer->affinity);
                           }
                       });
    return ret;
}

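// Dumps the network into a GraphViz .dot stream, coloring every node by the
// subgraph it belongs to and annotating it with its subgraph index and device.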
void dumpGraph(InferenceEngine::ICNNNetwork &network,
               const std::vector<LayersSet> &subgraphs,
               std::ostream &stream) {
    static const std::array<const char *, 9> colors{{
    auto split_color = [subgraphs](const CNNLayerPtr layer,
                                   ordered_properties &printed_properties,
                                   ordered_properties &node_properties) {
        for (size_t i = 0; i < subgraphs.size(); i++) {
            for (auto s : subgraphs[i]) {
                if (s->name == layer->name) {
                    node_properties.emplace_back(
                        "fillcolor",
                        colors[std::min(i, colors.size() - 1)]);
                    printed_properties.insert(printed_properties.begin(),
                        std::pair<std::string, std::string>("subgraph#", std::to_string(i)));
                    printed_properties.insert(printed_properties.begin(),
                        std::pair<std::string, std::string>("device", layer->affinity));
                    return;
                }
            }
        }
    };

    saveGraphToDot(network, stream, split_color);
}

HeteroExecutableNetwork::HeteroExecutableNetwork(InferenceEngine::ICNNNetwork &network,
                                                 const std::map<std::string, std::string> &config,
                                                 const std::vector<InferenceEngine::IExtensionPtr> &extensions,
                                                 MapDeviceLoaders &deviceLoaders,
                                                 InferenceEngine::IErrorListener *listener) :
    _deviceLoaders(deviceLoaders) {
    load(network, config, extensions, listener);
}

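// Node colorer used for the "hetero_affinity_*.dot" dump below; only declared
// here, the definition lives elsewhere in the plugin.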
void dla_layer_colorer(const CNNLayerPtr layer,
                       ordered_properties &printed_properties,
                       ordered_properties &node_properties);

void HeteroExecutableNetwork::load(InferenceEngine::ICNNNetwork &network_,
                                   const std::map<std::string, std::string> &config,
                                   const std::vector<InferenceEngine::IExtensionPtr> &extensions,
                                   InferenceEngine::IErrorListener *listener) {
    auto networkPtr = cloneNet(network_);
    auto &network = *networkPtr;

    // go over the whole network: if no layer has an affinity assigned, apply the default fallback policy
    details::CNNNetworkIterator i(&network);
    bool allEmpty = true;
    while (i != details::CNNNetworkIterator()) {
        CNNLayer::Ptr layer = *i;
        if (!layer->affinity.empty()) {
            allEmpty = false;
            break;
        }
        i++;
    }

    auto itDumpDotFile = config.find(KEY_HETERO_DUMP_GRAPH_DOT);
    bool dumpDotFile = itDumpDotFile != config.end() && itDumpDotFile->second == YES;
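    // Example (illustrative values only): a config of
    //   { {"TARGET_FALLBACK", "GPU,CPU"}, {KEY_HETERO_DUMP_GRAPH_DOT, YES} }
    // requests GPU-first fallback and enables the .dot dumps below.
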
    if (allEmpty) {
        FallbackPolicy fbPolicy(_deviceLoaders, dumpDotFile);
        auto it = config.find("TARGET_FALLBACK");
        if (it != config.end()) {
            fbPolicy.init(it->second, config, extensions);
            if (listener)
                for (auto &device_loader : _deviceLoaders)
                    device_loader.second->SetLogCallback(*listener);
            fbPolicy.setAffinity(config, network);
        } else {
            THROW_IE_EXCEPTION << "The 'TARGET_FALLBACK' option was not defined for the heterogeneous plugin";
        }
    } else if (dumpDotFile) {
        std::stringstream stream(std::stringstream::out);
        stream << "hetero_affinity_" << network.getName() << ".dot";

        std::ofstream file(stream.str().c_str());
        saveGraphToDot(network, file, dla_layer_colorer);
    }

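    // Every layer except inputs must carry an affinity by now; remember one
    // offending layer so the diagnostics below can name it.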
    details::CNNNetworkIterator el(&network);
    bool someEmptyAffinity = false;
    CNNLayer::Ptr layerEmptyAffinity = nullptr;
    while (el != details::CNNNetworkIterator()) {
        CNNLayer::Ptr layer = *el;
        if (!CaselessEq<std::string>()(layer->type, "input") &&
            layer->affinity.empty()) {
            someEmptyAffinity = true;
            layerEmptyAffinity = layer;
        }
        el++;
    }

    if (allEmpty && someEmptyAffinity) {
        THROW_IE_EXCEPTION << "Hetero plugin used the default fallback policy, but some layers, e.g.:\n(Name: " <<
            layerEmptyAffinity->name << ", Type: " << layerEmptyAffinity->type <<
            "), could not be assigned to any of the specified devices.\n" <<
            "This happens because these layers are not supported by the plugins by default.\n" <<
            "You need to implement custom layers to support them.";
    } else if (someEmptyAffinity) {
        THROW_IE_EXCEPTION << "The network passed to LoadNetwork has affinities assigned, but some layers, e.g.:\n(Name: " <<
            layerEmptyAffinity->name << ", Type: " << layerEmptyAffinity->type <<
            "), were not assigned to any device.\n" <<
            "This might happen if you assigned layers manually and missed some of them, or\n" <<
            "if you used an automatic assignment mode which decided that these layers are not\n" <<
            "supported by any plugin.";
    }

    InputsDataMap externalInputsData;
    network.getInputsInfo(externalInputsData);

    OutputsDataMap externalOutputsData;
    network.getOutputsInfo(externalOutputsData);

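    // Split the network into connected subgraphs whose layers share one
    // affinity, then order the subgraphs for sequential execution.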
    auto subgraphs = splitGraph(network, getAffinities(network));

    sortSubgraphs(subgraphs);

    if (dumpDotFile) {
        std::stringstream stream(std::stringstream::out);
        stream << "hetero_subgraphs_" << network.getName() << ".dot";

        std::ofstream file(stream.str().c_str());
        dumpGraph(network, subgraphs, file);
    }

    std::vector<NetworkDesc> descs;
    PluginDispatcher dispatcher({ "" });
    std::vector<CNNLayerPtr> tempLayers;

    // plugins have to be created first: they are used later to select the best precision for intermediate blobs
    for (auto &&subgraph : subgraphs) {
        assert(!subgraph.empty());
        auto affinity = (*subgraph.begin())->affinity;
        assert(!affinity.empty());
        if (_deviceLoaders.find(affinity) == _deviceLoaders.end()) {
            // TODO: this duplicates the code in FallbackPolicy::init
            IHeteroDeviceLoader::Ptr loader;
            loader = std::make_shared<HeteroDeviceLoader>(affinity);
            HeteroDeviceLoader *pdl = dynamic_cast<HeteroDeviceLoader *>(loader.get());
            pdl->initConfigs(config, extensions);
            _deviceLoaders[affinity] = loader;
        }
        if (listener)
            _deviceLoaders[affinity]->SetLogCallback(*listener);
    }

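    // Pick up per-layer statistics from the original network (if any) so that
    // the cloned subgraphs keep them.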
    InferenceEngine::ICNNNetworkStats *networkStats = nullptr;
    if (StatusCode::OK != network.getStats(&networkStats, nullptr)) {
        networkStats = nullptr;
    }

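    // Clone each subgraph into a standalone network: restore external outputs,
    // copy pre-processing info, fix up precisions of intermediate data and
    // remember which input/output names belong to this piece.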
    for (auto &&subgraph : subgraphs) {
        auto affinity = (*subgraph.begin())->affinity;
        tempLayers.assign(subgraph.begin(), subgraph.end());
        auto tempNetwork = cloneNet(tempLayers, networkStats);
        tempNetwork->setName(network.getName() + "_" + std::to_string(std::distance(subgraphs.data(), &subgraph)));
        // restore outputs of the original network that were not marked as outputs automatically;
        // this can happen if an output was set manually for the original network and
        // is not consumed by the next subgraph
        for (auto il : tempLayers) {
            if (externalOutputsData.find(il->name) != externalOutputsData.end()) {
                tempNetwork->addOutput(il->name);
            }
        }

        tempNetwork->setPrecision(network.getPrecision());

        // update pre-processing info
        InputsDataMap clonedInputs;
        tempNetwork->getInputsInfo(clonedInputs);
        for (auto &&it : externalInputsData) {
            auto inp = clonedInputs.find(it.first);
            if (inp != clonedInputs.end() && nullptr != inp->second) {
                inp->second->setInputPrecision(it.second->getInputPrecision());
                inp->second->getPreProcess() = it.second->getPreProcess();
            }
        }

        // go over all inputs/outputs and, for now,
        // set the precision of intermediate data (not of external data) to FP32;
        // later on we have to add Plugin::getPreferableInputPrecision(network) and
        // Plugin::getPreferableOutputPrecision(network) and set the precision based on that info
        // TODO(amalyshe) add clever selection of precision for intermediate blobs
        for (auto &&it : clonedInputs) {
            if (externalInputsData.find(it.first) == externalInputsData.end()) {
                it.second->setInputPrecision(Precision::FP32);
            }
        }

        OutputsDataMap tmpOutputs;
        tempNetwork->getOutputsInfo(tmpOutputs);
        for (auto &&o : tmpOutputs) {
            if (externalOutputsData.find(o.first) == externalOutputsData.end()) {
                o.second->setPrecision(Precision::FP32);
            }
        }

        // Temporary solution until each plugin starts to support the desired precision.
        // Only for the registered CPU device do we change all FP16 types to FP32 and convert blobs, if any.
        // TODO(amalyshe) remove this hack in favor of a proper network.setPrecision(FP16) fed to the CPU plugin
        if (affinity == "CPU") {
            tempNetwork->setPrecision(Precision::FP32);
            details::CNNNetworkIterator itcpu(reinterpret_cast<ICNNNetwork *>(tempNetwork.get()));
            while (itcpu != details::CNNNetworkIterator()) {
                CNNLayer::Ptr layer = *itcpu;
                layer->precision = Precision::FP32;
                // take all input and output data and set FP32 precision for them
                for (auto o : layer->outData) {
                    if (externalInputsData.find(o->getName()) == externalInputsData.end() &&
                        externalOutputsData.find(o->getName()) == externalOutputsData.end()) {
                        o->setPrecision(Precision::FP32);
                    }
                }
                for (auto i : layer->insData) {
                    if (externalInputsData.find(i.lock()->getName()) == externalInputsData.end() &&
                        externalOutputsData.find(i.lock()->getName()) == externalOutputsData.end()) {
                        i.lock()->setPrecision(Precision::FP32);
                    }
                }

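                // Helper: copy an FP16 blob into a newly allocated FP32 blob with
                // the same layout and dimensions.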
                auto convertBlobFP16toFP32 = [](Blob::Ptr blob) -> Blob::Ptr {
                    Blob::Ptr weightsBlob = make_shared_blob<float>(Precision::FP32, blob->layout(), blob->dims());
                    weightsBlob->allocate();
                    float *target = weightsBlob->buffer().as<float *>();
                    short *source = blob->buffer().as<short *>();
                    PrecisionUtils::f16tof32Arrays(target, source, blob->size(), 1.0f, 0.0f);
                    return weightsBlob;
                };
                auto wLayer = dynamic_cast<InferenceEngine::WeightableLayer *>(layer.get());
                if (nullptr != wLayer) {
                    if (wLayer->_weights && wLayer->_weights->precision() == Precision::FP16) {
                        wLayer->_weights = convertBlobFP16toFP32(wLayer->_weights);
                    } else if (wLayer->_weights && wLayer->_weights->precision() != Precision::FP32) {
                        THROW_IE_EXCEPTION << "weights for layer '" << wLayer->name << "' have an unsupported precision";
                    }
                    if (wLayer->_biases && wLayer->_biases->precision() == Precision::FP16) {
                        wLayer->_biases = convertBlobFP16toFP32(wLayer->_biases);
                    } else if (wLayer->_biases && wLayer->_biases->precision() != Precision::FP32) {
                        THROW_IE_EXCEPTION << "biases for layer '" << wLayer->name << "' have an unsupported precision";
                    }
                }
                for (auto &&blob : layer->blobs) {
                    auto &&data = blob.second;
                    if (nullptr != data) {
                        if (data->precision() == Precision::FP16) {
                            data = convertBlobFP16toFP32(data);
                        } else if (data->precision() != Precision::FP32) {
                            THROW_IE_EXCEPTION << "weights '" << blob.first << "' for layer '" << layer->name << "' have an unsupported precision";
                        }  // else no need to convert
                    }
                }
                itcpu++;
            }
        }

        NetworkDesc desc;
        desc._device = affinity;
        desc._deviceLoader = _deviceLoaders[affinity];

        desc._clonedNetwork = tempNetwork;
        InputsDataMap inputs;
        desc._clonedNetwork->getInputsInfo(inputs);
        for (auto i : inputs) {
            desc._iNames.insert(i.first);
        }
        OutputsDataMap outputs;
        desc._clonedNetwork->getOutputsInfo(outputs);
        for (auto o : outputs) {
            desc._oNames.insert(o.first);
        }

        descs.emplace_back(std::move(desc));
    }

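    // Load every cloned subnetwork onto its target device; the clone itself is
    // released as soon as the executable network is created.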
    for (auto &&d : descs) {
        IExecutableNetwork::Ptr ret;
        ResponseDesc resp;
        StatusCode status = d._deviceLoader->LoadNetwork(d._device, ret, *d._clonedNetwork, config, &resp);
        if (StatusCode::OK != status) {
            THROW_IE_EXCEPTION << resp.msg;
        }
        d.network = std::make_shared<ExecutableNetwork>(ret);
        d._clonedNetwork = nullptr;
    }

    networks = std::move(descs);
}
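
// Builds a HeteroInferRequest holding one sub-request per loaded subnetwork,
// in the order the subgraphs were created.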
InferRequestInternal::Ptr HeteroExecutableNetwork::CreateInferRequestImpl(
        InputsDataMap networkInputs,
        OutputsDataMap networkOutputs) {
    HeteroInferRequest::SubRequestsList inferRequests;
    int index = 0;
    for (auto i : networks) {
        HeteroInferRequest::SubRequestDesc desc;
        desc._network = i.network;
        desc._iNames = i._iNames;
        desc._oNames = i._oNames;
        desc._profilingTask = ProfilingTask{"Infer" + std::to_string(index++)};

        inferRequests.push_back(desc);
    }
    return std::make_shared<HeteroInferRequest>(networkInputs,
                                                networkOutputs,
                                                inferRequests);
}
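
// Wraps the synchronous hetero request into an asynchronous, thread-safe one
// and exposes it through the public IInferRequest interface.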
void HeteroExecutableNetwork::CreateInferRequest(IInferRequest::Ptr &asyncRequest) {
    auto heteroInferRequest = std::dynamic_pointer_cast<HeteroInferRequest>(
        CreateInferRequestImpl(_networkInputs, _networkOutputs));

    heteroInferRequest->setPointerToExecutableNetworkInternal(shared_from_this());

    auto asyncTreadSafeImpl = std::make_shared<HeteroAsyncInferRequest>(
        heteroInferRequest, _taskExecutor, _taskSynchronizer, _callbackExecutor);

    asyncRequest.reset(new InferRequestBase<HeteroAsyncInferRequest>(asyncTreadSafeImpl),
                       [](IInferRequest *p) { p->Release(); });
    asyncTreadSafeImpl->SetPointerToPublicInterface(asyncRequest);
}