// Publishing 2019 R1 content
// [platform/upstream/dldt.git] inference-engine/src/hetero_plugin/hetero_executable_network.cpp
1 //
2 // Copyright (C) 2018-2019 Intel Corporation.
3 //
4 // This software and the related documents are Intel copyrighted materials,
5 // and your use of them is governed by the express license under which they
6 // were provided to you (End User License Agreement for the Intel(R) Software
7 // Development Products (Version May 2017)). Unless the License provides
8 // otherwise, you may not use, modify, copy, publish, distribute, disclose or
9 // transmit this software or the related documents without Intel's prior
10 // written permission.
11 //
12 // This software and the related documents are provided as is, with no
13 // express or implied warranties, other than those that are expressly
14 // stated in the License.
15 //
16
17 #include "hetero_executable_network.h"
18 #include "hetero_async_infer_request.h"
19 #include "ie_util_internal.hpp"
20 #include "hetero_device_loader.h"
21
22 #include <array>
23 #include <set>
24 #include <utility>
25 #include <unordered_map>
26 #include <fstream>
27 #include <algorithm>
28
29 #include <ie_plugin_dispatcher.hpp>
30 #include <ie_graph_splitter.hpp>
31 #include "fallback_policy.h"
32 #include "details/caseless.hpp"
33 #include "ie_plugin_config.hpp"
34 #include "hetero/hetero_plugin_config.hpp"
35 #include "precision_utils.h"
36
37 using namespace InferenceEngine;
38 using namespace details;
39 using namespace HeteroPlugin;
40 using namespace InferenceEngine::PluginConfigParams;
41 using namespace InferenceEngine::HeteroConfigParams;
42
43 namespace {
44 std::vector<std::string> getAffinities(InferenceEngine::ICNNNetwork &network) {
45     std::vector<std::string> ret;
46     std::unordered_set<std::string> affinities;
47     traverse::traverse(network,
48                        [&](const InferenceEngine::CNNLayerPtr &layer) {
49                            assert(nullptr != layer);
50                            if (!contains(affinities, layer->affinity)) {
51                                affinities.insert(layer->affinity);
52                                ret.push_back(layer->affinity);
53                            }
54                        });
55     return ret;
56 }
57
58 void dumpGraph(InferenceEngine::ICNNNetwork &network,
59                const std::vector<LayersSet> &subgraphs,
60                std::ostream &stream) {
61     static const std::array<const char *, 9> colors{{
62                                                             "#FFC405",
63                                                             "#20F608",
64                                                             "#F1F290",
65                                                             "#C405FF",
66                                                             "#BCFF05",
67                                                             "#05FFC4",
68                                                             "#FFC405",
69                                                             "#5A5DF0",
70                                                             "#FF2E05"}};
71     auto split_color = [subgraphs](const CNNLayerPtr layer,
72                                    ordered_properties &printed_properties,
73                                    ordered_properties &node_properties) {
74         for (size_t i = 0; i < subgraphs.size(); i++) {
75             for (auto s : subgraphs[i]) {
76                 if (s->name == layer->name) {
77                     node_properties.emplace_back(
78                             "fillcolor",
79                             colors[std::min(i, colors.size() - 1)]);
80                     printed_properties.insert(printed_properties.begin(),
81                                               std::pair<std::string, std::string>("subgraph#", std::to_string(i)));
82                     printed_properties.insert(printed_properties.begin(),
83                                               std::pair<std::string, std::string>("device", layer->affinity));
84                     return;
85                 }
86             }
87         }
88     };
89
90     saveGraphToDot(network, stream, split_color);
91 }
92
93 }   // namespace
94
// Constructs the heterogeneous executable network. Stores a reference to the
// plugin-wide device-loader map (shared across networks so loaders are
// created once per device) and immediately compiles the network via load(),
// which throws on any failure.
HeteroExecutableNetwork::HeteroExecutableNetwork(InferenceEngine::ICNNNetwork &network,
                                                 const std::map<std::string, std::string> &config,
                                                 const std::vector<InferenceEngine::IExtensionPtr> &extensions,
                                                 MapDeviceLoaders& deviceLoaders,
                                                 InferenceEngine::IErrorListener *listener) :
    _deviceLoaders(deviceLoaders) {
    load(network, config, extensions, listener);
}
103
// Node-colorer callback used when dumping the affinity graph to .dot; it is
// defined in another translation unit of this plugin (presumably the fallback
// policy sources — TODO confirm).
void dla_layer_colorer(const CNNLayerPtr layer,
                       ordered_properties &printed_properties,
                       ordered_properties &node_properties);
107
108 void HeteroExecutableNetwork::load(InferenceEngine::ICNNNetwork &network_,
109                                    const std::map<std::string, std::string> &config,
110                                    const std::vector<InferenceEngine::IExtensionPtr> &extensions,
111                                    InferenceEngine::IErrorListener *listener) {
112     auto networkPtr = cloneNet(network_);
113     auto& network = *networkPtr;
114
115     // going over all network, if all layers are not assigned to devices, apply the default fallback policy
116     details::CNNNetworkIterator i(&network);
117     bool allEmpty = true;
118     while (i != details::CNNNetworkIterator()) {
119         CNNLayer::Ptr layer = *i;
120         if (!layer->affinity.empty()) {
121             allEmpty = false;
122             break;
123         }
124         i++;
125     }
126
127     auto itDumpDotFile = config.find(KEY_HETERO_DUMP_GRAPH_DOT);
128     bool dumpDotFile = itDumpDotFile != config.end() ? itDumpDotFile->second == YES : false;
129 #ifndef NDEBUG
130     dumpDotFile  = true;
131 #endif
132
133     if (allEmpty) {
134         FallbackPolicy fbPolicy(_deviceLoaders, dumpDotFile);
135         auto it = config.find("TARGET_FALLBACK");
136         if (it != config.end()) {
137             fbPolicy.init(it->second, config, extensions);
138             if (listener)
139                 for (auto& device_loader : _deviceLoaders)
140                     device_loader.second->SetLogCallback(*listener);
141             fbPolicy.setAffinity(config, network);
142         } else {
143             THROW_IE_EXCEPTION << "The 'TARGET_FALLBACK' option was not defined for heterogeneous plugin";
144         }
145     } else {
146         if (dumpDotFile) {
147             std::stringstream stream(std::stringstream::out);
148             stream << "hetero_affinity_" << network.getName() << ".dot";
149
150             std::ofstream file(stream.str().c_str());
151             saveGraphToDot(network, file, dla_layer_colorer);
152         }
153     }
154
155     details::CNNNetworkIterator el(&network);
156     bool someEmptyAffinity = false;
157     CNNLayer::Ptr layerEmptyAffinity = nullptr;
158     while (el != details::CNNNetworkIterator()) {
159         CNNLayer::Ptr layer = *el;
160         if (!CaselessEq<std::string>()(layer->type, "input") &&
161             layer->affinity.empty()) {
162             someEmptyAffinity = true;
163             layerEmptyAffinity = layer;
164         }
165         el++;
166     }
167
168     if (allEmpty && someEmptyAffinity) {
169         THROW_IE_EXCEPTION << "Hetero plugin used default fallback policy, but some layers eg: \n(Name:" <<
170             layerEmptyAffinity->name << ", Type: " << layerEmptyAffinity->type <<
171             ") were not able to be assigned on any pointed device.\n" <<
172             "It happened because these layers are not supported in plugins by default.\n" <<
173             "You need to implement custom layers to support them.";
174     } else if (someEmptyAffinity) {
175         THROW_IE_EXCEPTION << "Network passed to LoadNetwork has affinity assigned, but some layers eg: \n(Name:" <<
176             layerEmptyAffinity->name << ", Type: " << layerEmptyAffinity->type <<
177             ") were not assigned to any device.\n" <<
178             "It might happen if you assigned layers amnually and missed some layers or\n" <<
179             "if you used some automatic assigning mode which decided that these layers are not\n" <<
180             "supported by any plugin";
181     }
182
183
184     InputsDataMap externalInputsData;
185     network.getInputsInfo(externalInputsData);
186
187     OutputsDataMap externalOutputsData;
188     network.getOutputsInfo(externalOutputsData);
189
190     auto subgraphs = splitGraph(network, getAffinities(network));
191
192     sortSubgraphs(subgraphs);
193
194     if (dumpDotFile) {
195         std::stringstream stream(std::stringstream::out);
196         stream << "hetero_subgraphs_" << network.getName() << ".dot";
197
198         std::ofstream file(stream.str().c_str());
199         dumpGraph(network, subgraphs, file);
200     }
201
202     std::vector<NetworkDesc> descs;
203     PluginDispatcher dispatcher({ "" });
204     std::vector<CNNLayerPtr> tempLayers;
205
206     // we need to create plugins first to use them later during selection of best precisino for intermediate blobs
207     for (auto &&subgraph : subgraphs) {
208         assert(!subgraph.empty());
209         auto affinity = (*subgraph.begin())->affinity;
210         assert(!affinity.empty());
211         if (_deviceLoaders.find(affinity) == _deviceLoaders.end()) {
212             // TODO: here is a duplication of the code with FallbackPolicy::init
213             IHeteroDeviceLoader::Ptr loader;
214             loader = std::make_shared<HeteroDeviceLoader>(affinity);
215             HeteroDeviceLoader *pdl = dynamic_cast<HeteroDeviceLoader *>(loader.get());
216             pdl->initConfigs(config, extensions);
217             _deviceLoaders[affinity] = loader;
218         }
219         if (listener)
220             _deviceLoaders[affinity]->SetLogCallback(*listener);
221     }
222
223     InferenceEngine::ICNNNetworkStats* networkStats = nullptr;
224     if (StatusCode::OK != network.getStats(&networkStats, nullptr)) {
225         networkStats = nullptr;
226     }
227
228
229     for (auto &&subgraph : subgraphs) {
230         auto affinity = (*subgraph.begin())->affinity;
231         tempLayers.assign(subgraph.begin(), subgraph.end());
232         auto tempNetwork = cloneNet(tempLayers, networkStats);
233         tempNetwork->setName(network.getName() + "_" + std::to_string(std::distance(subgraphs.data(), &subgraph)));
234         // restoring some outputs from original net if they are not marked as output automatically
235         // this might happen if output was set manually for origin network and
236         // it doesn't go to next subgraph
237         for (auto il : tempLayers) {
238             if (externalOutputsData.find(il->name) != externalOutputsData.end()) {
239                 tempNetwork->addOutput(il->name);
240             }
241         }
242
243         tempNetwork->setPrecision(network.getPrecision());
244
245         // update of pre-processing info
246         InputsDataMap clonedInputs;
247         tempNetwork->getInputsInfo(clonedInputs);
248         for (auto &&it : externalInputsData) {
249             auto inp = clonedInputs.find(it.first);
250             if (inp != clonedInputs.end() && nullptr != inp->second) {
251                 inp->second->setInputPrecision(it.second->getInputPrecision());
252                 inp->second->getPreProcess() = it.second->getPreProcess();
253             }
254         }
255         // go over all inputs/outputs and right now
256         // set precision for intermediate data (not for external) to FP32
257         // later on we have to add Plugin::getPreferableInputPrecision(network) and
258         // Plugin::getPreferableOutputPrecision(network) and set precision based on this info
259         // TODO(amalyshe) add clever selectino of precision for intermediate blobs
260         for (auto &&it : clonedInputs) {
261             if (externalInputsData.find(it.first) == externalInputsData.end()) {
262                 it.second->setInputPrecision(Precision::FP32);
263             }
264         }
265
266         OutputsDataMap tmpOutputs;
267         tempNetwork->getOutputsInfo(tmpOutputs);
268         for (auto &&o : tmpOutputs) {
269             if (externalOutputsData.find(o.first) == externalOutputsData.end()) {
270                 o.second->setPrecision(Precision::FP32);
271             }
272         }
273
274         // Temporal solution until each plugin starts to support desirable precision
275         // Only for CPU registered device we are changing all FP16 types to FP32 and convert blobs if any
276         // TODO(amalyshe) remove this hack to preoper network.setPrecision(FP16) and feeding to CPU plugin
277         if (affinity == "CPU") {
278             tempNetwork->setPrecision(Precision::FP32);
279             details::CNNNetworkIterator itcpu(reinterpret_cast<ICNNNetwork *>(tempNetwork.get()));
280             bool allEmpty = true;
281             while (itcpu != details::CNNNetworkIterator()) {
282                 CNNLayer::Ptr layer = *itcpu;
283                 layer->precision = Precision::FP32;
284                 // take all input and output data, set FP32 precision for them
285                 for (auto o : layer->outData) {
286                     if (externalInputsData.find(o->getName()) == externalInputsData.end() &&
287                         externalOutputsData.find(o->getName()) == externalOutputsData.end()) {
288                         o->setPrecision(Precision::FP32);
289                     }
290                 }
291                 for (auto i : layer->insData) {
292                     if (externalInputsData.find(i.lock()->getName()) == externalInputsData.end() &&
293                         externalOutputsData.find(i.lock()->getName()) == externalOutputsData.end()) {
294                         i.lock()->setPrecision(Precision::FP32);
295                     }
296                 }
297
298                 auto convertBlobFP16toFP32 = [](Blob::Ptr blob) -> Blob::Ptr {
299                     Blob::Ptr weightsBlob = make_shared_blob<float>(Precision::FP32, blob->layout(), blob->dims());
300                     weightsBlob->allocate();
301                     float* target = weightsBlob->buffer().as<float*>();
302                     short* source = blob->buffer().as<short *>();
303                     PrecisionUtils::f16tof32Arrays(target, source, blob->size(), 1.0f, 0.0f);
304                     return weightsBlob;
305                 };
306                 // convert blobs
307                 auto wLayer = dynamic_cast<InferenceEngine::WeightableLayer *>(layer.get());
308                 if (wLayer) {
309                     // verify
310                     if (wLayer->_weights && wLayer->_weights->precision() == Precision::FP16) {
311                         wLayer->_weights = convertBlobFP16toFP32(wLayer->_weights);
312                     } else if (wLayer->_weights && wLayer->_weights->precision() != Precision::FP32) {
313                         THROW_IE_EXCEPTION << "weights for layer '" << wLayer->name << "' has unsupported precision";
314                     }
315                     if (wLayer->_biases && wLayer->_biases->precision() == Precision::FP16) {
316                         wLayer->_biases = convertBlobFP16toFP32(wLayer->_biases);
317                     } else if (wLayer->_biases && wLayer->_biases->precision() != Precision::FP32) {
318                         THROW_IE_EXCEPTION << "biases for layer '" << wLayer->name << "' has unsupported precision";
319                     }
320                 }
321                 for (auto&& blob : layer->blobs) {
322                     auto&& data = blob.second;
323                     if (nullptr != data) {
324                         if (data->precision() == Precision::FP16) {
325                             data = convertBlobFP16toFP32(data);
326                         } else if (data->precision() != Precision::FP32) {
327                             THROW_IE_EXCEPTION << "weights '" << blob.first << "' for layer '" << layer->name << "' has unsupported precision";
328                         }  // else no need to convert
329                     }
330                 }
331                 itcpu++;
332             }
333         }
334
335         NetworkDesc desc;
336         desc._device = affinity;
337         desc._deviceLoader = _deviceLoaders[affinity];
338
339         desc._clonedNetwork = tempNetwork;
340         InputsDataMap inputs;
341         desc._clonedNetwork->getInputsInfo(inputs);
342         for (auto i : inputs) {
343             desc._iNames.insert(i.first);
344         }
345         OutputsDataMap outputs;
346         desc._clonedNetwork->getOutputsInfo(outputs);
347         for (auto o : outputs) {
348             desc._oNames.insert(o.first);
349         }
350
351         descs.emplace_back(std::move(desc));
352     }
353
354     for (auto &&d : descs) {
355         IExecutableNetwork::Ptr ret;
356         ResponseDesc resp;
357         StatusCode status = d._deviceLoader->LoadNetwork(d._device, ret, *d._clonedNetwork, config, &resp);
358         if (status != OK) {
359             THROW_IE_EXCEPTION << resp.msg;
360         }
361         d.network = std::make_shared<ExecutableNetwork>(ret);
362         d._clonedNetwork = nullptr;
363     }
364
365
366     networks = std::move(descs);
367 }
368
369 InferRequestInternal::Ptr HeteroExecutableNetwork::CreateInferRequestImpl(
370         InputsDataMap networkInputs,
371         OutputsDataMap networkOutputs) {
372     HeteroInferRequest::SubRequestsList inferRequests;
373     int index = 0;
374     for (auto i : networks) {
375         HeteroInferRequest::SubRequestDesc desc;
376         desc._network = i.network;
377         desc._iNames = i._iNames;
378         desc._oNames = i._oNames;
379         desc._profilingTask = ProfilingTask{"Infer" + std::to_string(index++)};
380
381         inferRequests.push_back(desc);
382     }
383     return std::make_shared<HeteroInferRequest>(networkInputs,
384                                                 networkOutputs,
385                                                 inferRequests);
386 }
387
388 void HeteroExecutableNetwork::CreateInferRequest(IInferRequest::Ptr &asyncRequest) {
389     auto heteroInferRequest = std::dynamic_pointer_cast<HeteroInferRequest>(
390             CreateInferRequestImpl(_networkInputs, _networkOutputs));
391     heteroInferRequest->setPointerToExecutableNetworkInternal(shared_from_this());
392     auto asyncTreadSafeImpl = std::make_shared<HeteroAsyncInferRequest>(
393             heteroInferRequest, _taskExecutor, _taskSynchronizer, _callbackExecutor);
394     asyncRequest.reset(new InferRequestBase<HeteroAsyncInferRequest>(asyncTreadSafeImpl),
395                        [](IInferRequest *p) { p->Release(); });
396     asyncTreadSafeImpl->SetPointerToPublicInterface(asyncRequest);
397 }