// Publishing 2019 R1 content
// [platform/upstream/dldt.git] inference-engine/src/hetero_plugin/hetero_executable_network.cpp
1 //
2 // Copyright (C) 2018-2019 Intel Corporation.
3 //
4 // This software and the related documents are Intel copyrighted materials,
5 // and your use of them is governed by the express license under which they
6 // were provided to you (End User License Agreement for the Intel(R) Software
7 // Development Products (Version May 2017)). Unless the License provides
8 // otherwise, you may not use, modify, copy, publish, distribute, disclose or
9 // transmit this software or the related documents without Intel's prior
10 // written permission.
11 //
12 // This software and the related documents are provided as is, with no
13 // express or implied warranties, other than those that are expressly
14 // stated in the License.
15 //
16
17 #include "hetero_executable_network.h"
18 #include "hetero_async_infer_request.h"
19 #include "ie_util_internal.hpp"
20 #include "hetero_device_loader.h"
21
22 #include <array>
23 #include <set>
24 #include <utility>
25 #include <unordered_map>
26 #include <fstream>
27 #include <algorithm>
28
29 #include <ie_plugin_dispatcher.hpp>
30 #include <ie_graph_splitter.hpp>
31 #include "fallback_policy.h"
32 #include "details/caseless.hpp"
33 #include "ie_plugin_config.hpp"
34 #include "hetero/hetero_plugin_config.hpp"
35 #include "precision_utils.h"
36
37 using namespace InferenceEngine;
38 using namespace details;
39 using namespace HeteroPlugin;
40 using namespace InferenceEngine::PluginConfigParams;
41 using namespace InferenceEngine::HeteroConfigParams;
42
43 namespace {
44 std::vector<std::string> getAffinities(InferenceEngine::ICNNNetwork &network) {
45     std::vector<std::string> ret;
46     std::unordered_set<std::string> affinities;
47     traverse::traverse(network,
48                        [&](const InferenceEngine::CNNLayerPtr &layer) {
49                            assert(nullptr != layer);
50                            if (!contains(affinities, layer->affinity)) {
51                                affinities.insert(layer->affinity);
52                                ret.push_back(layer->affinity);
53                            }
54                        });
55     return ret;
56 }
57
58 void dumpGraph(InferenceEngine::ICNNNetwork &network,
59                const std::vector<LayersSet> &subgraphs,
60                std::ostream &stream) {
61     static const std::array<const char *, 9> colors{{
62                                                             "#FFC405",
63                                                             "#20F608",
64                                                             "#F1F290",
65                                                             "#C405FF",
66                                                             "#BCFF05",
67                                                             "#05FFC4",
68                                                             "#FFC405",
69                                                             "#5A5DF0",
70                                                             "#FF2E05"}};
71     auto split_color = [subgraphs](const CNNLayerPtr layer,
72                                    ordered_properties &printed_properties,
73                                    ordered_properties &node_properties) {
74         for (size_t i = 0; i < subgraphs.size(); i++) {
75             for (auto s : subgraphs[i]) {
76                 if (s->name == layer->name) {
77                     node_properties.emplace_back(
78                             "fillcolor",
79                             colors[std::min(i, colors.size() - 1)]);
80                     printed_properties.insert(printed_properties.begin(),
81                                               std::pair<std::string, std::string>("subgraph#", std::to_string(i)));
82                     printed_properties.insert(printed_properties.begin(),
83                                               std::pair<std::string, std::string>("device", layer->affinity));
84                     return;
85                 }
86             }
87         }
88     };
89
90     saveGraphToDot(network, stream, split_color);
91 }
92
93 }   // namespace
94
// Constructs the heterogeneous executable network. Stores a reference to the
// plugin-wide device-loader map (shared across networks so loaders are
// created once per device) and immediately compiles the network via load(),
// which throws on any failure.
HeteroExecutableNetwork::HeteroExecutableNetwork(InferenceEngine::ICNNNetwork &network,
                                                 const std::map<std::string, std::string> &config,
                                                 const std::vector<InferenceEngine::IExtensionPtr> &extensions,
                                                 MapDeviceLoaders& deviceLoaders,
                                                 InferenceEngine::IErrorListener *listener) :
    _deviceLoaders(deviceLoaders) {
    load(network, config, extensions, listener);
}
103
// Node-colorer callback used when dumping the affinity graph to .dot; it is
// defined in another translation unit of this plugin (presumably the fallback
// policy sources — TODO confirm).
void dla_layer_colorer(const CNNLayerPtr layer,
                       ordered_properties &printed_properties,
                       ordered_properties &node_properties);
107
108 void HeteroExecutableNetwork::load(InferenceEngine::ICNNNetwork &network_,
109                                    const std::map<std::string, std::string> &config,
110                                    const std::vector<InferenceEngine::IExtensionPtr> &extensions,
111                                    InferenceEngine::IErrorListener *listener) {
112     auto networkPtr = cloneNet(network_);
113     auto& network = *networkPtr;
114
115     // going over all network, if all layers are not assigned to devices, apply the default fallback policy
116     details::CNNNetworkIterator i(&network);
117     bool allEmpty = true;
118     while (i != details::CNNNetworkIterator()) {
119         CNNLayer::Ptr layer = *i;
120         if (!layer->affinity.empty()) {
121             allEmpty = false;
122             break;
123         }
124         i++;
125     }
126
127     auto itDumpDotFile = config.find(KEY_HETERO_DUMP_GRAPH_DOT);
128     bool dumpDotFile = itDumpDotFile != config.end() ? itDumpDotFile->second == YES : false;
129 #ifndef NDEBUG
130     dumpDotFile  = true;
131 #endif
132
133     if (allEmpty) {
134         FallbackPolicy fbPolicy(_deviceLoaders, dumpDotFile);
135         auto it = config.find("TARGET_FALLBACK");
136         if (it != config.end()) {
137             fbPolicy.init(it->second, config, extensions);
138             if (listener)
139                 for (auto& device_loader : _deviceLoaders)
140                     device_loader.second->SetLogCallback(*listener);
141             fbPolicy.setAffinity(config, network);
142         } else {
143             THROW_IE_EXCEPTION << "The 'TARGET_FALLBACK' option was not defined for heterogeneous plugin";
144         }
145     } else {
146         if (dumpDotFile) {
147             std::stringstream stream(std::stringstream::out);
148             stream << "hetero_affinity_" << network.getName() << ".dot";
149
150             std::ofstream file(stream.str().c_str());
151             saveGraphToDot(network, file, dla_layer_colorer);
152         }
153     }
154
155     details::CNNNetworkIterator el(&network);
156     bool someEmptyAffinity = false;
157     CNNLayer::Ptr layerEmptyAffinity = nullptr;
158     while (el != details::CNNNetworkIterator()) {
159         CNNLayer::Ptr layer = *el;
160         if (!CaselessEq<std::string>()(layer->type, "input") &&
161             layer->affinity.empty()) {
162             someEmptyAffinity = true;
163             layerEmptyAffinity = layer;
164         }
165         el++;
166     }
167
168     if (allEmpty && someEmptyAffinity) {
169         THROW_IE_EXCEPTION << "Hetero plugin used default fallback policy, but some layers eg: \n(Name:" <<
170             layerEmptyAffinity->name << ", Type: " << layerEmptyAffinity->type <<
171             ") were not able to be assigned on any pointed device.\n" <<
172             "It happened because these layers are not supported in plugins by default.\n" <<
173             "You need to implement custom layers to support them.";
174     } else if (someEmptyAffinity) {
175         THROW_IE_EXCEPTION << "Network passed to LoadNetwork has affinity assigned, but some layers eg: \n(Name:" <<
176             layerEmptyAffinity->name << ", Type: " << layerEmptyAffinity->type <<
177             ") were not assigned to any device.\n" <<
178             "It might happen if you assigned layers amnually and missed some layers or\n" <<
179             "if you used some automatic assigning mode which decided that these layers are not\n" <<
180             "supported by any plugin";
181     }
182
183
184     InputsDataMap externalInputsData;
185     network.getInputsInfo(externalInputsData);
186
187     OutputsDataMap externalOutputsData;
188     network.getOutputsInfo(externalOutputsData);
189
190     auto subgraphs = splitGraph(network, getAffinities(network));
191
192     sortSubgraphs(subgraphs);
193
194     if (dumpDotFile) {
195         std::stringstream stream(std::stringstream::out);
196         stream << "hetero_subgraphs_" << network.getName() << ".dot";
197
198         std::ofstream file(stream.str().c_str());
199         dumpGraph(network, subgraphs, file);
200     }
201
202     std::vector<NetworkDesc> descs;
203     PluginDispatcher dispatcher({ "" });
204     std::vector<CNNLayerPtr> tempLayers;
205
206     // we need to create plugins first to use them later during selection of best precisino for intermediate blobs
207     for (auto &&subgraph : subgraphs) {
208         assert(!subgraph.empty());
209         auto affinity = (*subgraph.begin())->affinity;
210         assert(!affinity.empty());
211         if (_deviceLoaders.find(affinity) == _deviceLoaders.end()) {
212             // TODO: here is a duplication of the code with FallbackPolicy::init
213             IHeteroDeviceLoader::Ptr loader;
214             loader = std::make_shared<HeteroDeviceLoader>(affinity);
215             HeteroDeviceLoader *pdl = dynamic_cast<HeteroDeviceLoader *>(loader.get());
216             pdl->initConfigs(config, extensions);
217             _deviceLoaders[affinity] = loader;
218         }
219         if (listener)
220             _deviceLoaders[affinity]->SetLogCallback(*listener);
221     }
222
223     InferenceEngine::ICNNNetworkStats* networkStats = nullptr;
224     if (StatusCode::OK != network.getStats(&networkStats, nullptr)) {
225         networkStats = nullptr;
226     }
227
228
229     for (auto &&subgraph : subgraphs) {
230         auto affinity = (*subgraph.begin())->affinity;
231         tempLayers.assign(subgraph.begin(), subgraph.end());
232         auto tempNetwork = cloneNet(tempLayers, networkStats);
233         tempNetwork->setName(network.getName() + "_" + std::to_string(std::distance(subgraphs.data(), &subgraph)));
234         // restoring some outputs from original net if they are not marked as output automatically
235         // this might happen if output was set manually for origin network and
236         // it doesn't go to next subgraph
237         for (auto il : tempLayers) {
238             if (externalOutputsData.find(il->name) != externalOutputsData.end()) {
239                 tempNetwork->addOutput(il->name);
240             }
241         }
242
243         tempNetwork->setPrecision(network.getPrecision());
244
245         // update of pre-processing info
246         InputsDataMap clonedInputs;
247         tempNetwork->getInputsInfo(clonedInputs);
248         for (auto &&it : externalInputsData) {
249             auto inp = clonedInputs.find(it.first);
250             if (inp != clonedInputs.end() && nullptr != inp->second) {
251                 inp->second->setInputPrecision(it.second->getInputPrecision());
252                 inp->second->getPreProcess() = it.second->getPreProcess();
253             }
254         }
255         // go over all inputs/outputs and right now
256         // set precision for intermediate data (not for external) to FP32
257         // later on we have to add Plugin::getPreferableInputPrecision(network) and
258         // Plugin::getPreferableOutputPrecision(network) and set precision based on this info
259         // TODO(amalyshe) add clever selectino of precision for intermediate blobs
260         for (auto &&it : clonedInputs) {
261             if (externalInputsData.find(it.first) == externalInputsData.end()) {
262                 it.second->setInputPrecision(Precision::FP32);
263             }
264         }
265
266         OutputsDataMap tmpOutputs;
267         tempNetwork->getOutputsInfo(tmpOutputs);
268         for (auto &&o : tmpOutputs) {
269             if (externalOutputsData.find(o.first) == externalOutputsData.end()) {
270                 o.second->setPrecision(Precision::FP32);
271             }
272         }
273
274         // Temporal solution until each plugin starts to support desirable precision
275         // Only for CPU registered device we are changing all FP16 types to FP32 and convert blobs if any
276         // TODO(amalyshe) remove this hack to preoper network.setPrecision(FP16) and feeding to CPU plugin
277         if (affinity == "CPU") {
278             tempNetwork->setPrecision(Precision::FP32);
279             details::CNNNetworkIterator itcpu(reinterpret_cast<ICNNNetwork *>(tempNetwork.get()));
280             bool allEmpty = true;
281             while (itcpu != details::CNNNetworkIterator()) {
282                 CNNLayer::Ptr layer = *itcpu;
283                 layer->precision = Precision::FP32;
284                 // take all input and output data, set FP32 precision for them
285                 for (auto o : layer->outData) {
286                     if (externalInputsData.find(o->getName()) == externalInputsData.end() &&
287                         externalOutputsData.find(o->getName()) == externalOutputsData.end()) {
288                         o->setPrecision(Precision::FP32);
289                     }
290                 }
291                 for (auto i : layer->insData) {
292                     if (externalInputsData.find(i.lock()->getName()) == externalInputsData.end() &&
293                         externalOutputsData.find(i.lock()->getName()) == externalOutputsData.end()) {
294                         i.lock()->setPrecision(Precision::FP32);
295                     }
296                 }
297
298                 auto convertBlobFP16toFP32 = [](Blob::Ptr blob) -> Blob::Ptr {
299                     Blob::Ptr weightsBlob = make_shared_blob<float>(Precision::FP32, blob->layout(), blob->dims());
300                     weightsBlob->allocate();
301                     float* target = weightsBlob->buffer().as<float*>();
302                     short* source = blob->buffer().as<short *>();
303                     PrecisionUtils::f16tof32Arrays(target, source, blob->size(), 1.0f, 0.0f);
304                     return weightsBlob;
305                 };
306                 // convert blobs
307                 auto wLayer = dynamic_cast<InferenceEngine::WeightableLayer *>(layer.get());
308                 if (wLayer) {
309                     // verify
310                     if (wLayer->_weights && wLayer->_weights->precision() == Precision::FP16) {
311                         wLayer->_weights = convertBlobFP16toFP32(wLayer->_weights);
312                     } else if (wLayer->_weights && wLayer->_weights->precision() != Precision::FP32) {
313                         THROW_IE_EXCEPTION << "weights for layer '" << wLayer->name << "' has unsupported precision";
314                     }
315                     if (wLayer->_biases && wLayer->_biases->precision() == Precision::FP16) {
316                         wLayer->_biases = convertBlobFP16toFP32(wLayer->_biases);
317                     } else if (wLayer->_biases && wLayer->_biases->precision() != Precision::FP32) {
318                         THROW_IE_EXCEPTION << "biases for layer '" << wLayer->name << "' has unsupported precision";
319                     }
320                 }
321                 for (auto&& blob : layer->blobs) {
322                     auto&& data = blob.second;
323                     if (nullptr != data) {
324                         if (data->precision() == Precision::FP16) {
325                             data = convertBlobFP16toFP32(data);
326                         } else if (data->precision() != Precision::FP32) {
327                             THROW_IE_EXCEPTION << "weights '" << blob.first << "' for layer '" << layer->name << "' has unsupported precision";
328                         }  // else no need to convert
329                     }
330                 }
331                 itcpu++;
332             }
333         }
334
335         NetworkDesc desc;
336         desc._device = affinity;
337         desc._deviceLoader = _deviceLoaders[affinity];
338
339         desc._clonedNetwork = tempNetwork;
340         InputsDataMap inputs;
341         desc._clonedNetwork->getInputsInfo(inputs);
342         for (auto i : inputs) {
343             desc._iNames.insert(i.first);
344         }
345         OutputsDataMap outputs;
346         desc._clonedNetwork->getOutputsInfo(outputs);
347         for (auto o : outputs) {
348             desc._oNames.insert(o.first);
349         }
350
351         descs.emplace_back(std::move(desc));
352     }
353
354     for (auto &&d : descs) {
355         IExecutableNetwork::Ptr ret;
356         ResponseDesc resp;
357         StatusCode status = d._deviceLoader->LoadNetwork(d._device, ret, *d._clonedNetwork, config, &resp);
358         if (status != OK) {
359             THROW_IE_EXCEPTION << resp.msg;
360         }
361         d.network = std::make_shared<ExecutableNetwork>(ret);
362         d._clonedNetwork = nullptr;
363     }
364
365
366     networks = std::move(descs);
367 }
368
369 InferRequestInternal::Ptr HeteroExecutableNetwork::CreateInferRequestImpl(
370         InputsDataMap networkInputs,
371         OutputsDataMap networkOutputs) {
372     HeteroInferRequest::SubRequestsList inferRequests;
373     int index = 0;
374     for (auto i : networks) {
375         HeteroInferRequest::SubRequestDesc desc;
376         desc._network = i.network;
377         desc._iNames = i._iNames;
378         desc._oNames = i._oNames;
379         desc._profilingTask = ProfilingTask{"Infer" + std::to_string(index++)};
380
381         inferRequests.push_back(desc);
382     }
383     return std::make_shared<HeteroInferRequest>(networkInputs,
384                                                 networkOutputs,
385                                                 inferRequests);
386 }
387
388 void HeteroExecutableNetwork::CreateInferRequest(IInferRequest::Ptr &asyncRequest) {
389     auto heteroInferRequest = std::dynamic_pointer_cast<HeteroInferRequest>(
390             CreateInferRequestImpl(_networkInputs, _networkOutputs));
391     heteroInferRequest->setPointerToExecutableNetworkInternal(shared_from_this());
392     auto asyncTreadSafeImpl = std::make_shared<HeteroAsyncInferRequest>(
393             heteroInferRequest, _taskExecutor, _taskSynchronizer, _callbackExecutor);
394     asyncRequest.reset(new InferRequestBase<HeteroAsyncInferRequest>(asyncTreadSafeImpl),
395                        [](IInferRequest *p) { p->Release(); });
396     asyncTreadSafeImpl->SetPointerToPublicInterface(asyncRequest);
397 }