inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp

   1 // Copyright (C) 2018-2019 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
   5 #include <algorithm>
   6 #include <string>
   7 #include <map>
   8 #include <vector>
   9 #include <unordered_set>
  10 #include <limits>
  11 #include <fstream>
  12 #include <unordered_map>
  13 #include <memory>
  14 #include "details/caseless.hpp"
  15
  16 #include "mkldnn_graph.h"
  17 #include "mkldnn_graph_optimizer.h"
  18 #include <debug.h>
  19 #include <nodes/mkldnn_input_node.h>
  20 #include <nodes/mkldnn_reorder_node.h>
  21 #include <nodes/mkldnn_depthwise_node.h>
  22 #include <nodes/mkldnn_conv_node.h>
  23
  24 #include "mkldnn_extension_utils.h"
  25 #include "mkldnn_extension_mngr.h"
  26 #include "mkldnn/omp_manager.h"
  27 #include <graph_tools.hpp>
  28 #include <cpp_interfaces/ie_executor_manager.hpp>
  29 #include "ie_algorithm.hpp"
  30 #include "memory_solver.hpp"
  31 #include "mkldnn_infer_request.h"
  32 #include "mkldnn_async_infer_request.h"
  33 #include <blob_factory.hpp>
  34 #include <ie_util_internal.hpp>
  35 #include <net_pass.h>
  36 #include <details/ie_cnn_network_tools.h>
  37
  38 #include <mkldnn_graph_dumper.h>
  39
  40 #include <data_stats.h>
  41 #include "cnn_network_int8_normalizer.hpp"
  42 #include "ie_memcpy.h"
  43
  44 #define XBYAK_NO_OP_NAMES
  45 #define XBYAK_UNDEF_JNL
  46 #include "../../thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_util.h"
  47
  48 #include "cnn_network_stats_impl.hpp"
  49
  50 #include "utils/blob_dump.h"
  51
/*****************************************************
 * Debug capability
 *  - BLOB_DUMP_PATH : Define it as the name of an
 *    existing folder to dump intermediate blobs into.
 *  - PRINT_GRAPH_INFO : Define it to print additional
 *    information to standard output.
 *
 *****************************************************/
  60 // #define BLOB_DUMP_PATH "mkldnn_dump"
  61 // #define PRINT_GRAPH_INFO
  62 // #define DUMP_AS_TEXT
  63
// Dump helpers. When BLOB_DUMP_PATH is defined, DUMP_DIR expands to it and
// ENABLE_DUMP(_x) expands to the wrapped statement; otherwise DUMP_DIR is an
// empty string and ENABLE_DUMP(_x) expands to nothing.
#ifdef BLOB_DUMP_PATH
#   define DUMP_DIR        BLOB_DUMP_PATH
#   define ENABLE_DUMP(_x) { _x ;}
#else
#   define DUMP_DIR ""
#   define ENABLE_DUMP(_x)
#endif
  71
  72 using namespace mkldnn;
  73 using namespace MKLDNNPlugin;
  74 using namespace MKLDNNPlugin::cpu;
  75 using namespace InferenceEngine;
  76 using namespace InferenceEngine::details;
  77
  78 void MKLDNNGraph::CreateGraph(const ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) {
  79     if (IsReady())
  80         ForgetGraphData();
  81
  82     Replicate(network, extMgr);
  83     InitGraph();
  84     status = Ready;
  85 }
  86
  87 void MKLDNNGraph::Replicate(const ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) {
  88     InputsDataMap inputs;
  89     network.getInputsInfo(inputs);
  90     if (inputs.empty()) {
  91         THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: No inputs for the topology";
  92     }
  93
  94     // The input layer precision has to be equal to the InputData precision
  95     for (const auto& input : inputs) {
  96         auto inputLayer = input.second->getInputData()->getCreatorLayer().lock();
  97         if (inputLayer) inputLayer->precision = inputLayer->outData[0]->precision;
  98     }
  99
 100     std::unordered_map<CNNLayerPtr, MKLDNNNodePtr> layer2node;
 101
 102     auto _parent_port = [] (const DataPtr &data) -> int {
 103         auto parent = data->creatorLayer.lock();
 104         for (int i = 0; parent->outData.size(); i++)
 105             if (data == parent->outData[i])
 106                 return i;
 107         return -1;
 108     };
 109
 110     // Replicate All Nodes in topological order
 111     for (const auto layer : CNNNetSortTopologically(network)) {
 112         CNNLayerPtr _layer = layer;
 113         if (layer->type == "Memory" && layer->GetParamAsString("index") == "1") {
 114             auto memoryId = layer->GetParamAsString("id");
 115             _layer.reset(new CNNLayer({layer->name + "/id=" + memoryId, "MemoryInput", layer->precision}));
 116             _layer->params = layer->params;
 117             _layer->outData = layer->outData;
 118         }
 119
 120         const MKLDNNNodePtr node(MKLDNNNode::CreateNode(_layer, getEngine(), extMgr));
 121         graphNodes.push_back(node);
 122         layer2node[layer] = node;
 123
 124         for (int port = 0; port < layer->insData.size(); port++) {
 125             auto data = layer->insData[port].lock();
 126             auto parent_layer = data->creatorLayer.lock();
 127             if (!parent_layer) continue;  // no parent means that it is input data node (or memory/const layer)
 128
 129             auto parent_node = layer2node[parent_layer];
 130
 131             MKLDNNEdgePtr edge(new MKLDNNEdge(parent_node, node, _parent_port(data), port));
 132             node->addEdge(edge);
 133             graphEdges.push_back(edge);
 134         }
 135     }
 136
 137     std::map<std::string, DataPtr> outputs;
 138     network.getOutputsInfo(outputs);
 139
 140     for (const auto &output : outputs) {
 141         const auto data = output.second;
 142
 143         auto parent_layer = data->creatorLayer.lock();
 144         auto parent_node = layer2node[parent_layer];
 145
 146         CNNLayerPtr layer(new CNNLayer({"out_" + output.first, "Output", data->precision}));
 147         layer->insData.push_back(data);
 148
 149         const MKLDNNNodePtr node(MKLDNNNode::CreateNode(layer, getEngine(), extMgr));
 150
 151         MKLDNNEdgePtr edge(new MKLDNNEdge(parent_node, node, _parent_port(data), 0));
 152         node->addEdge(edge);
 153         graphEdges.push_back(edge);
 154
 155         graphNodes.push_back(node);
 156         outputNodes.push_back(node);
 157         layer2node[layer] = node;
 158     }
 159
 160     // Replicate input nodes
 161     for (const auto& input : inputs) {
 162         auto inputLayer = input.second->getInputData()->getCreatorLayer().lock();
 163         inputNodes[input.first] = layer2node[inputLayer];
 164
 165         // Loading mean images
 166         MKLDNNDims outDims(inputNodes[input.first]->getChildEdgeAt(0)->getDims());
 167         if (inputs.find(input.first) != inputs.end()) {
 168             InputInfo::Ptr ii = inputs[input.first];
 169             if (ii && ii->getPreProcess().getNumberOfChannels()) {
 170                 _meanImages[input.first].Load(outDims, ii);
 171             }
 172         }
 173     }
 174 }
 175
 176 void MKLDNNGraph::InitGraph() {
 177     SortTopologically();
 178     MKLDNNGraphOptimizer optimizer;
 179     optimizer.ApplyCommonGraphOptimizations(*this);
 180     SortTopologically();
 181
 182     InitNodes();
 183
 184     for (auto &node : graphNodes) {
 185         node->initOptimalPrimitiveDescriptor();
 186     }
 187     InitEdges();
 188
 189     optimizer.ApplyImplSpecificGraphOptimizations(*this);
 190
 191     SortTopologically();
 192
 193     Allocate();
 194
 195     CreatePrimitives();
 196
 197     // Do it before cleanup. Because it will lose original layers information
 198     for (auto &graphNode : graphNodes) {
 199         auto nodeType = graphNode->getType();
 200         if (nodeType == Reorder || nodeType == Output) continue;
 201
 202         graphNode->addOriginalLayer(graphNode->getCnnLayer());
 203         if (graphNode->getFusedWith().size() || graphNode->getMergeWith().size()) {
 204             // Original layer names
 205             std::vector<MKLDNNNodePtr> internal = graphNode->getFusedWith();
 206             auto &merged = graphNode->getMergeWith();
 207             internal.insert(internal.end(), merged.begin(), merged.end());
 208
 209             for (auto &sub_node : internal) {
 210                 graphNode->addOriginalLayer(sub_node->getCnnLayer());
 211             }
 212         }
 213     }
 214     if (!config.dumpToDot.empty())
 215         dumpToDotFile(config.dumpToDot + "_init.dot");
 216
 217     for (auto &graphNode : graphNodes) {
 218         graphNode->cleanup();
 219     }
 220
 221 #if !defined(NDEBUG) && defined(PRINT_GRAPH_INFO)
 222     for (auto &graphNode : graphNodes) {
 223         std::cout << "name: " << graphNode->getName() << " [ ";
 224         if (graphNode->parentEdges.size() > 0) {
 225             auto prnt_out_desc = graphNode->parentEdges[0].lock()->getOutputDesc();
 226             std::cout << "in: " << prnt_out_desc.getPrecision().name()
 227                       << "/l=" << prnt_out_desc.getLayout()
 228                     << "; ";
 229         }
 230         if (graphNode->childEdges.size() > 0) {
 231             auto chld_in_desc = graphNode->childEdges[0].lock()->getInputDesc();
 232             std::cout << "out: " << chld_in_desc.getPrecision().name()
 233                       << "/l=" << chld_in_desc.getLayout();
 234         }
 235         std::cout << " ]"  << std::endl;
 236     }
 237 #endif
 238
 239     mkldnn::stream stream = mkldnn::stream(stream::kind::eager);
 240     for (auto &graphNode : graphNodes) {
 241         if (!graphNode->isConstant())
 242             continue;
 243         graphNode->execute(stream);
 244     }
 245 }
 246
 247 void MKLDNNGraph::InitNodes() {
 248     for (auto &node : graphNodes) {
 249         if (node->getType() == Input && _meanImages.find(node->getName()) != _meanImages.end()) {
 250             auto *inputNode = dynamic_cast<MKLDNNInputNode *>(node.get());
 251             if (inputNode)
 252                 inputNode->withMeanImage();
 253         }
 254         node->getSupportedDescriptors();
 255
 256         node->initSupportedPrimitiveDescriptors();
 257     }
 258
 259     for (auto &node : graphNodes) {
 260         node->selectOptimalPrimitiveDescriptor();
 261     }
 262 }
 263
 264 void MKLDNNGraph::InitEdges() {
 265     auto reorderArgs = [](InferenceEngine::TensorDesc parentDesc, InferenceEngine::TensorDesc childDesc) {
 266         std::string inArgs, outArgs;
 267         if (parentDesc.getPrecision() != childDesc.getPrecision()) {
 268             inArgs += (inArgs.empty() ? "" : "_") + std::string(parentDesc.getPrecision().name());
 269             outArgs += (outArgs.empty() ? "" : "_") + std::string(childDesc.getPrecision().name());
 270         }
 271         if (MKLDNNMemoryDesc(parentDesc).getFormat() != MKLDNNMemoryDesc(childDesc).getFormat()) {
 272             inArgs += (inArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(parentDesc).getFormat());
 273             outArgs += (outArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(childDesc).getFormat());
 274         }
 275         return inArgs + "_" + outArgs;
 276     };
 277     size_t numberOfEdges = graphEdges.size();
 278     for (auto i = 0; i < numberOfEdges; i++) {
 279         if (graphEdges[i]->needReorder()) {
 280             auto &edge = graphEdges[i];
 281             std::string layerName = edge->getParent()->getName() + "_" +
 282                                     reorderArgs(edge->getInputDesc(), edge->getOutputDesc()) + "_" +
 283                                     edge->getChild()->getName();
 284             CNNLayerPtr layer(new CNNLayer({layerName,
 285                                             "Reorder",
 286                                             edge->getInputDesc().getPrecision()}));
 287             MKLDNNNodePtr newReorder(new MKLDNNReorderNode(layer, getEngine()));
 288             auto *reorderPtr = dynamic_cast<MKLDNNReorderNode *>(newReorder.get());
 289             if (reorderPtr) {
 290                 reorderPtr->setDescs(edge->getInputDesc(), edge->getOutputDesc());
 291             }
 292
 293             auto oIndex = edge->getOutputNum();
 294             auto iIndex = edge->getInputNum();
 295             if (iIndex < 0 || oIndex < 0)
 296                 THROW_IE_EXCEPTION << "Cannot create reorder for nodes: "
 297                                    << edge->getParent()->getName() << " and "
 298                                    << edge->getChild()->getName() << ".";
 299
 300             edge->drop();
 301
 302             MKLDNNEdgePtr beforeNode(new MKLDNNEdge(edge->getParent(), newReorder, iIndex, 0));
 303             MKLDNNEdgePtr afterNode(new MKLDNNEdge(newReorder, edge->getChild(), 0, oIndex));
 304
 305             // Add edge for beforeNode
 306             beforeNode->getChild()->parentEdges.push_back(beforeNode);
 307             edge->getParent()->childEdges.push_back(beforeNode);
 308
 309             // Add edge for afterNode
 310             afterNode->getParent()->childEdges.push_back(afterNode);
 311             edge->getChild()->parentEdges.push_back(afterNode);
 312
 313             newReorder->getSupportedDescriptors();
 314             newReorder->initSupportedPrimitiveDescriptors();
 315             newReorder->selectOptimalPrimitiveDescriptor();
 316
 317             graphEdges.push_back(beforeNode);
 318             graphEdges.push_back(afterNode);
 319
 320             graphNodes.push_back(newReorder);
 321             graphEdges.erase(graphEdges.begin() + i);
 322             i--;
 323             numberOfEdges--;
 324         }
 325     }
 326 }
 327
 328 static inline bool isConstOutput(MKLDNNEdgePtr edge) {
 329     return edge->getParent()->isConstant() && !edge->getChild()->isConstant();
 330 }
 331
// Allocates one shared workspace for all edges and hands each edge cluster a
// region inside it.
//
// Outline:
//  1. Group edges into clusters: an edge with status NotAllocated joins the
//     cluster of its shared edge; any other edge forms its own cluster unless
//     it is already a member of one.
//  2. Work-around pass: merge clusters that contain the same edge.
//  3. Describe every cluster as a MemorySolver::Box (execution-index interval
//     and size in `alignment` units).
//  4. Ask MemorySolver for the total size and per-box offsets, create the
//     workspace and allocate the NeedAllocation edge of every cluster in it.
void MKLDNNGraph::AllocateWithReuse() {
    std::vector<std::vector<MKLDNNEdgePtr>> edge_clasters;

    // Detect edge clusters whose members are views on one edge.
    for (auto &edge : graphEdges) {
        // Only NotAllocated edges are asked for a shared edge.
        MKLDNNEdgePtr par = (edge->getStatus() == MKLDNNEdge::Status::NotAllocated)
                            ? edge->getSharedEdge()
                            : nullptr;
        if (par) {
            // Add the edge to the cluster(s) that already hold its shared edge.
            // NOTE: `break` leaves only the inner loop, so the remaining
            // clusters are still scanned.
            bool found = false;
            for (auto &claster : edge_clasters) {
                for (auto &element : claster) {
                    if (element == par) {
                        if (std::find(claster.begin(), claster.end(), edge) == claster.end())
                            claster.push_back(edge);
                        found = true;
                        break;
                    }
                }
            }
            // No cluster holds the shared edge yet: start one with both edges.
            if (!found)
                edge_clasters.push_back({par, edge});
        } else {
            // Edge without a shared edge: give it its own cluster unless an
            // earlier iteration already placed it in one.
            bool found = false;
            for (auto &claster : edge_clasters) {
                for (auto &element : claster) {
                    if (element == edge) {
                        found = true;
                        break;
                    }
                }
            }
            if (!found)
                edge_clasters.push_back({edge});
        }
    }

    //======= WA. getSharedEdge() may return non-identical edges ============
    //  Try to merge clusters that contain the same edge
    for (auto &edge : graphEdges) {
        // Pointers to every cluster that contains this edge.
        std::vector<decltype(&edge_clasters[0])> to_merge;

        for (auto &claster : edge_clasters)
            if (std::find(claster.begin(), claster.end(), edge) != claster.end())
                to_merge.push_back(&claster);

        if (to_merge.size() > 1) {
            // Merge clusters: everything is moved into the first match
            auto base_classter = to_merge[0];
            for (int i = 1; i < to_merge.size(); i++) {
                base_classter->insert(base_classter->end(),
                                      to_merge[i]->begin(), to_merge[i]->end());
                to_merge[i]->clear();
            }

            // remove duplicates in the merged cluster
            std::sort(base_classter->begin(), base_classter->end());
            base_classter->erase(std::unique(base_classter->begin(), base_classter->end()),
                                 base_classter->end() );

            // remove empty clusters (this invalidates the pointers in to_merge,
            // which are not used afterwards)
            edge_clasters.erase(std::remove_if(edge_clasters.begin(), edge_clasters.end(),
                                               [] ( std::vector<MKLDNNEdgePtr> &cls) { return cls.empty(); }),
                                edge_clasters.end());
        }
    }
    //======= End of WA ============

    const int64_t alignment = 32;  // 32 bytes

    // One box per cluster, at the same index.
    std::vector<MemorySolver::Box> boxes(edge_clasters.size());
    for (int i = 0; i < edge_clasters.size(); i++) {
        MemorySolver::Box &box = boxes[i];
        // Initial values chosen so the min/max updates below take effect;
        // the last field carries the cluster index.
        box = { std::numeric_limits<int>::max(), 0, 0, i };
        for (auto &edge : edge_clasters[i]) {
            int e_start = edge->getParent()->execIndex;
            int e_finish = edge->getChild()->execIndex;

            const BlockingDesc block_desk = edge->getDesc().getBlockingDesc();

            // Element count from the offset padding to the last element ...
            int64_t e_size = block_desk.getOffsetPadding() + 1;  // size in bytes (from begin of data to last element)
            for (int j = 0; j < block_desk.getBlockDims().size(); j++)
                e_size += (block_desk.getBlockDims()[j] - 1) * block_desk.getStrides()[j];

            // ... scaled by the precision size (a factor of 1 is used for BIN).
            e_size *= edge->getDesc().getPrecision() == Precision::BIN ? 1 : edge->getDesc().getPrecision().size();

            // The box covers the widest interval and the largest size in the cluster.
            box.start = std::min(e_start, box.start);
            box.finish = std::max(e_finish, box.finish);
            box.size =  std::max(e_size, box.size);
        }

        // Constant data are filled once on load.
        // So we need it untouchable during all execution time
        // -1 is a place holder for a max timestamp.
        bool isConst = false, isOutput = false, isInput = false;
        for (auto &edge : edge_clasters[i]) {
            isConst  |= isConstOutput(edge);
            isOutput |= edge->getChild()->getType() == Output;
            isInput  |= edge->getParent()->getType() == Input;

            // WA. MemoryOutput will keep data in that edge
            // So need to make it immortal..
            isConst |= edge->getParent()->getType() == MemoryInput;
        }

        if (isInput  | isConst) box.start = 0;
        if (isOutput | isConst) box.finish = -1;

        // From here on the box size is expressed in `alignment` units.
        box.size = div_up(box.size, alignment);
    }

    MemorySolver memSolver(boxes);
    // The solver result is scaled by `alignment` to get the workspace size in bytes.
    size_t total_size = static_cast<size_t>(memSolver.solve()) * alignment;

    // The workspace is a one-dimensional I8 memory of total_size elements.
    memWorkspace = std::make_shared<MKLDNNMemory>(eng);
    memWorkspace->Create(MKLDNNMemoryDesc(TensorDesc(Precision::I8, {total_size}, Layout::C)));
    auto* workspace_ptr = static_cast<int8_t*>(memWorkspace->GetData());

    for (int i = 0; i < edge_clasters.size(); i++) {
        // Number of edges in this cluster that received memory.
        int count = 0;
        for (auto &edge : edge_clasters[i]) {
            if (edge->getStatus() == MKLDNNEdge::Status::NeedAllocation) {
                int64_t offset = memSolver.getOffset(i);
                // !! Fallback to individual memory allocation !!
                // if you like to check infer without reuse just call this function without arguments.
                edge->allocate(workspace_ptr + offset * alignment);  // alignment in byte
                count++;
            }
        }
        // Exactly one edge per cluster is expected to need allocation.
        IE_ASSERT(count == 1);
    }
}
 464
 465 void MKLDNNGraph::Allocate() {
 466     // resolve edges. Define which will be a view on others
 467     //   NeedAllocation - real blob
 468     //   NotAllocated - view on other blob, peer or in-place
 469     for (auto& edge : graphEdges) edge->init();
 470
 471     // Allocate memory space for all edges marked with NeedAllocation
 472     AllocateWithReuse();
 473
 474     // Resolve all other edges with status NotAllocated or in-place
 475     for (auto& node : graphNodes) node->resolveNotAllocatedEdges();
 476
 477     // Check all getters. Should work.
 478     for (auto& edge : graphEdges) edge->validate();
 479 }
 480
 481 void MKLDNNGraph::CreatePrimitives() {
 482     for (auto& node : graphNodes) {
 483         node->createPrimitive();
 484     }
 485 }
 486
 487 void MKLDNNGraph::PushInputData(const std::string& name, const InferenceEngine::Blob::Ptr &in) {
 488     if (!IsReady()) THROW_IE_EXCEPTION<< "Wrong state. Topology not ready.";
 489
 490     auto input = inputNodes.find(name);
 491     if (input != inputNodes.end()) {
 492         MKLDNNDims outDims = input->second->getChildEdgeAt(0)->getDims();
 493
 494         const void *ext_data_ptr = in->cbuffer();
 495         void *inter_data_ptr = input->second->getChildEdgeAt(0)->getMemory().GetData();
 496
 497         if (ext_data_ptr != inter_data_ptr) {
 498             auto l = in->getTensorDesc().getLayout();
 499             if (l == CHW && input->second->getChildEdgeAt(0)->getDims().ndims() == 4)
 500                 l = NCHW;
 501
 502             input->second->getChildEdgeAt(0)->getMemory().SetData(
 503                     MKLDNNExtensionUtils::IEPrecisionToDataType(in->getTensorDesc().getPrecision()),
 504                     MKLDNNMemory::Convert(l), ext_data_ptr, in->byteSize(), false);
 505         }
 506
 507         // todo: make sure 'name' exists in this map...
 508         if (_meanImages.find(name) != _meanImages.end()) {
 509             if (in->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32) {
 510                 _meanImages[name].Subtract(outDims, reinterpret_cast<float *>(inter_data_ptr), in->getTensorDesc().getLayout());
 511             } else {
 512                 THROW_IE_EXCEPTION << "Mean image of type " << in->getTensorDesc().getPrecision().name() << " is unsupported";
 513             }
 514         }
 515     } else {
 516         THROW_IE_EXCEPTION << "Input blob for infer '" << name << "' doesn't correspond to input in network";
 517     }
 518 }
 519
 520 void MKLDNNGraph::PullOutputData(BlobMap &out) {
 521     if (!IsReady())
 522         THROW_IE_EXCEPTION << "Wrong state. Topology not ready.";
 523
 524     for (MKLDNNNodePtr &node : outputNodes) {
 525         // remove out_ from node name
 526         std::string name = node->getName().substr(4);
 527         const MKLDNNMemory& intr_blob = node->getParentEdgeAt(0)->getMemory();
 528         if (out.find(name) == out.end()) {
 529             // TODO: Create blob from MemoryDesc
 530             Blob::Ptr outBlob = make_shared_blob<float>({Precision::FP32, node->getParentEdgeAt(0)->getDims().ToSizeVector(),
 531                                                          TensorDesc::getLayoutByDims(node->getParentEdgeAt(0)->getDims().ToSizeVector())},
 532                                                         reinterpret_cast<float*>(intr_blob.GetData()));
 533             out[name] = outBlob;
 534         }
 535
 536         Blob::Ptr &ext_blob = out[name];
 537
 538         // TODO: Why we allow allocation of output memory inside Infer call??
 539         // Suggestion is to disable this behaviour
 540         if (ext_blob->buffer() == nullptr) {
 541             SizeVector dims = node->getParentEdgeAt(0)->getDims().ToSizeVector();
 542             std::reverse(dims.begin(), dims.end());  // Blobs dims are in reverse order (legacy of OpenVX :-( )
 543             ext_blob->Resize(dims);
 544             ext_blob->allocate();
 545         }
 546
 547         if (ext_blob->byteSize() != intr_blob.GetSize())
 548             THROW_IE_EXCEPTION << "Output blob size is not equal network output size ("
 549                                << ext_blob->size() << "!=" << intr_blob.GetSize()/sizeof(float) << ").";
 550
 551         void *ext_blob_ptr = ext_blob->buffer();
 552         void *intr_blob_ptr = intr_blob.GetData();
 553
 554         // That is the same memory. No need to copy
 555         if (ext_blob_ptr == intr_blob_ptr) continue;
 556
 557         int MB = intr_blob.GetDims()[0];
 558         int MB_to_process = node->batchToProcess();
 559         // TODO: Should we support InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_LIMIT???
 560         if (config.batchLimit)
 561             MB_to_process = std::min<int>(config.batchLimit, MB_to_process);
 562         size_t size_to_copy = intr_blob.GetSize() * MB_to_process / MB;
 563
 564         ie_memcpy(ext_blob_ptr, ext_blob->byteSize(), intr_blob_ptr, size_to_copy);
 565     }
 566 }
 567
 568 void MKLDNNGraph::Infer(int batch) {
 569     if (!IsReady()) {
 570         THROW_IE_EXCEPTION << "Wrong state. Topology is not ready.";
 571     }
 572
 573     mkldnn::stream stream = mkldnn::stream(stream::kind::eager);
 574     for (int i = 0; i < graphNodes.size(); i++) {
 575         PERF(graphNodes[i]);
 576
 577         if (batch > 0)
 578             graphNodes[i]->setDynamicBatchLim(batch);
 579
 580         ENABLE_DUMP(do_before(DUMP_DIR, graphNodes[i]));
 581
 582         if (!graphNodes[i]->isConstant()) {
 583             IE_PROFILING_AUTO_SCOPE_TASK(graphNodes[i]->profilingTask)
 584             graphNodes[i]->execute(stream);
 585         }
 586
 587         ENABLE_DUMP(do_after(DUMP_DIR, graphNodes[i]));
 588     }
 589 }
 590
 591 void MKLDNNGraph::VisitNode(MKLDNNNodePtr node, std::vector<MKLDNNNodePtr>& sortedNodes) {
 592     if (node->temporary) {
 593         return;
 594     }
 595
 596     if (node->permanent) {
 597         return;
 598     }
 599
 600     node->temporary = true;
 601
 602     for (size_t i = 0; i < node->getChildEdges().size(); i++) {
 603         VisitNode(node->getChildEdgeAt(i)->getChild(), sortedNodes);
 604     }
 605
 606     node->permanent = true;
 607     node->temporary = false;
 608
 609     sortedNodes.insert(sortedNodes.begin(), node);
 610 }
 611
 612 void MKLDNNGraph::SortTopologically() {
 613     std::vector<MKLDNNNodePtr> unsorted;
 614     std::vector<MKLDNNNodePtr> sorted;
 615
 616     for (int i = 0; i < graphNodes.size(); i++) {
 617         MKLDNNNodePtr node = graphNodes[i];
 618
 619         node->permanent = false;
 620         node->temporary = false;
 621
 622         unsorted.push_back(node);
 623     }
 624
 625     while (!unsorted.empty()) {
 626         MKLDNNNodePtr node = unsorted.at(0);
 627         unsorted.erase(unsorted.begin());
 628
 629         VisitNode(node, sorted);
 630     }
 631
 632     for (int i = 0; i < sorted.size(); i++) sorted[i]->execIndex = i;
 633
 634     graphNodes.erase(graphNodes.begin(), graphNodes.end());
 635     graphNodes.assign(sorted.begin(), sorted.end());
 636
 637     // TODO: Sort in/out edges by port index because of backward compatibility
 638     //       A lot of plugin logic are build on top of assumption that index in
 639     //       vector childEdges/parentEdges is port number. But that is not
 640     //       truth anymore. But to keep old logic correct need to simulate ordering.
 641     //
 642     // Make first N (N == port_num) edge indexes are matched with port index
 643     for (auto &node : graphNodes) {
 644         {
 645             int port_num = node->inDims.size();
 646             std::vector<MKLDNNEdgePtr> res(port_num);
 647
 648             for (int i = 0; i < node->parentEdges.size(); i++) {
 649                 auto edge = node->getParentEdgeAt(i);
 650                 int port = edge->getOutputNum();
 651                 if (!res[port])
 652                     res[port] = edge;
 653                 else
 654                     res.push_back(edge);
 655             }
 656             node->parentEdges = {res.begin(), res.end()};
 657         }
 658         {
 659             int port_num = node->outDims.size();
 660             std::vector<MKLDNNEdgePtr> res(port_num);
 661
 662             for (int i = 0; i < node->childEdges.size(); i++) {
 663                 auto edge = node->getChildEdgeAt(i);
 664                 int port = edge->getInputNum();
 665                 if (!res[port])
 666                     res[port] = edge;
 667                 else
 668                     res.push_back(edge);
 669             }
 670             node->childEdges = {res.begin(), res.end()};
 671         }
 672     }
 673 }
 674
 675 void MKLDNNGraph::GetPerfData(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const {
 676     unsigned i = 0;
 677     std::function<void(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &, const MKLDNNNodePtr&)>
 678             getPerfMapFor = [&](std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap, const MKLDNNNodePtr& node) {
 679         InferenceEngine::InferenceEngineProfileInfo &pc = perfMap[node->getName()];
 680         pc.execution_index = i++;
 681         // TODO: Why time counter is signed?
 682         pc.cpu_uSec = pc.realTime_uSec = (long long) node->PerfCounter().avg();
 683         pc.status = pc.cpu_uSec > 0 ? InferenceEngine::InferenceEngineProfileInfo::EXECUTED
 684                                     : InferenceEngine::InferenceEngineProfileInfo::NOT_RUN;
 685         std::string pdType = node->getPrimitiveDescriptorType();
 686         size_t typeLen = sizeof(pc.exec_type) / sizeof(pc.exec_type[0]);
 687         pdType.copy(pc.exec_type, typeLen, 0);
 688         size_t layerTypeLen = sizeof(pc.layer_type) / sizeof(pc.layer_type[0]);
 689         node->typeStr.copy(pc.layer_type, layerTypeLen, 0);
 690
 691         for (auto& fusedNode : node->fusedWith) {
 692             getPerfMapFor(perfMap, fusedNode);
 693         }
 694
 695         for (auto& mergedWith : node->mergedWith) {
 696             getPerfMapFor(perfMap, mergedWith);
 697         }
 698     };
 699
 700     for (int i = 1; i < graphNodes.size(); i++) {
 701         getPerfMapFor(perfMap, graphNodes[i]);
 702     }
 703
 704     if (!config.dumpToDot.empty()) dumpToDotFile(config.dumpToDot + "_perf.dot");
 705 }
 706
 707 void MKLDNNGraph::setConfig(const Config &cfg) {
 708     config = cfg;
 709 }
 710
 711 void MKLDNNGraph::setProperty(const std::map<std::string, std::string>& properties) {
 712     config.readProperties(properties);
 713 }
 714
 715 Config MKLDNNGraph::getProperty() {
 716     return config;
 717 }
 718
 719 void MKLDNNGraph::getInputBlobs(InferenceEngine::BlobMap &resp) {
 720     for (auto &it : inputNodes) {
 721         MKLDNNInputNode* node = dynamic_cast<MKLDNNInputNode*>(it.second.get());
 722         if (!node || node->isConstant())
 723             continue;
 724         resp[it.first] = node->getChildEdgeAt(0)->getBlob();
 725     }
 726 }
 727
 728 void MKLDNNGraph::getOutputBlobs(InferenceEngine::BlobMap &resp) {
 729     for (auto &it : outputNodes) {
 730         std::string name = it->getName().substr(4);
 731         resp[name] = it->getParentEdgeAt(0)->getBlob();
 732     }
 733 }
 734
 735 void MKLDNNGraph::DropNode(const MKLDNNNodePtr &node) {
 736     auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
 737         auto& edges = graph.GetEdges();
 738         for (auto it = edges.begin(); it != edges.end(); it++) {
 739             if ((*it) == edge) {
 740                 edges.erase(it);
 741                 return;
 742             }
 743         }
 744     };
 745
 746     auto childs = node->childEdges;
 747     auto parents = node->parentEdges;
 748
 749     for (size_t i = 0; i < parents.size(); i++) {
 750         auto p_edge = parents[i].lock();
 751         if (!p_edge) continue;
 752         auto parent = p_edge->getParent();
 753         if (!parent) continue;
 754
 755         for (size_t j = 0; j < childs.size(); j++) {
 756             if (!childs[j].lock())
 757                 continue;
 758             auto child = childs[j].lock()->getChild();
 759             if (!child)
 760                 continue;
 761
 762             MKLDNNEdgePtr &remEdge = p_edge;
 763             int inNum = 0;
 764             if (remEdge) {
 765                 inNum = remEdge->getInputNum();
 766                 remEdge->drop();
 767                 removeEdge(*this, remEdge);
 768             }
 769             remEdge = childs[j].lock();
 770             int outNum = 0;
 771             if (remEdge) {
 772                 outNum = remEdge->getOutputNum();
 773                 remEdge->drop();
 774                 removeEdge(*this, remEdge);
 775             }
 776             MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, child, inNum, outNum));
 777             graphEdges.push_back(newEdge);
 778             parent->addEdge(newEdge);
 779         }
 780     }
 781 }
 782
 783 void MKLDNNGraph::RemoveDroppedNodes() {
 784     auto& nodes = this->GetNodes();
 785
 786     auto it = nodes.begin();
 787
 788     while (it != nodes.end()) {
 789         if ((*it)->isDropped()) {
 790             it = nodes.erase(it);
 791         } else {
 792             it++;
 793         }
 794     }
 795 }
 796
 797 void MKLDNNGraph::RemoveDroppedEdges() {
 798     auto& edges = this->GetEdges();
 799
 800     auto it = edges.begin();
 801
 802     while (it != edges.end()) {
 803         if ((*it)->isDropped()) {
 804             it = edges.erase(it);
 805         } else {
 806             it++;
 807         }
 808     }
 809 }
 810
 811 void MKLDNNGraph::dumpToDotFile(std::string file) const {
 812     std::ofstream dot;
 813     dot.open(file);
 814     if (!dot.is_open()) THROW_IE_EXCEPTION << "CPU Plugin cannot create dot file " << file << ".";
 815
 816     dump_graph_as_dot(*this, dot);
 817     dot.close();
 818 }
 819
 820 void MKLDNNGraph::do_before(const std::string &dir, const MKLDNNNodePtr &node) {
 821     auto exec_order = std::to_string(node->execIndex);
 822     std::string nodeName = node->name;
 823     std::replace(nodeName.begin(), nodeName.end(), '\\', '_');
 824     std::replace(nodeName.begin(), nodeName.end(), '/', '_');
 825     std::replace(nodeName.begin(), nodeName.end(), ' ', '_');
 826     std::replace(nodeName.begin(), nodeName.end(), ':', '_');
 827
 828     auto num_ports = node->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size();
 829     for (size_t i = 0; i < num_ports; i++) {
 830         auto prEdge = node->getParentEdgeAt(i);
 831         auto pr = prEdge->getParent();
 832
 833         auto dump_file = dir + "/#" + exec_order + "_" +  nodeName + "_in" + std::to_string(i) + ".ieb";
 834         TensorDesc desc = prEdge->getDesc();
 835         if (desc.getPrecision() == Precision::BIN)
 836             return;
 837         Blob::Ptr blob = make_blob_with_precision(desc, prEdge->getMemoryPtr()->GetData());
 838
 839         BlobDumper dumper(blob);
 840         if (pr->ext_scales) dumper.withScales(pr->ext_scales);
 841 #ifdef DUMP_AS_TEXT
 842         dumper.dumpAsTxt(dump_file);
 843 #else
 844         dumper.dump(dump_file);
 845 #endif
 846     }
 847 }
 848
 849 void MKLDNNGraph::do_after(const std::string &dir, const MKLDNNNodePtr &node) {
 850     auto exec_order = std::to_string(node->execIndex);
 851     auto nodeName = node->name;
 852     std::replace(nodeName.begin(), nodeName.end(), '\\', '_');
 853     std::replace(nodeName.begin(), nodeName.end(), '/', '_');
 854     std::replace(nodeName.begin(), nodeName.end(), ' ', '_');
 855     std::replace(nodeName.begin(), nodeName.end(), ':', '_');
 856
 857     auto num_ports = node->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size();
 858     for (size_t i = 0; i < num_ports; i++) {
 859         auto childEdge = node->getChildEdgeAt(i);
 860
 861         auto dump_file = dir + "/#" + exec_order + "_" +  nodeName + "_out" + std::to_string(i) + ".ieb";
 862         TensorDesc desc = childEdge->getDesc();
 863         if (desc.getPrecision() == Precision::BIN)
 864             return;
 865         Blob::Ptr blob = make_blob_with_precision(desc, childEdge->getMemoryPtr()->GetData());
 866
 867         BlobDumper dumper(blob);
 868         if (node->ext_scales) dumper.withScales(node->ext_scales);
 869
 870 #ifdef DUMP_AS_TEXT
 871         dumper.dumpAsTxt(dump_file);
 872 #else
 873         dumper.dump(dump_file);
 874 #endif
 875     }
 876 }
 877
 878 InferenceEngine::ICNNNetwork::Ptr MKLDNNGraph::dump() const {
 879     return dump_graph_as_ie_net(*this);
 880 }
 881
 882 bool MKLDNNExecNetwork::CanProcessDynBatch(const InferenceEngine::ICNNNetwork &network) const {
 883     InputsDataMap inputs;
 884     network.getInputsInfo(inputs);
 885
 886     CNNLayerSet inputLayers;
 887     std::unordered_set<CNNLayer *> allLayers;
 888
 889     if (inputs.empty())
 890         return false;
 891
 892     auto & secondLayers = inputs.begin()->second->getInputData()->getInputTo();
 893     if (secondLayers.empty())
 894         return false;
 895
 896     bool check_result = true;
 897     details::UnorderedDFS(allLayers, secondLayers.begin()->second, [&](CNNLayerPtr layer) {
 898         auto type = TypeFromName(layer->type);
 899         // This is WA for Tile layer
 900         auto tileLayer = dynamic_cast<TileLayer *>(layer.get());
 901         if (tileLayer && tileLayer->axis)
 902             return;
 903
 904         if (type != Input &&
 905             type != Output &&
 906             type != Convolution &&
 907             type != Deconvolution &&
 908             type != Activation &&
 909             type != Depthwise &&
 910             type != Lrn &&
 911             type != Pooling &&
 912             type != FullyConnected &&
 913             type != Gemm &&
 914             type != SoftMax &&
 915             type != Split &&
 916             type != Concatenation &&
 917             type != Power &&
 918             type != Eltwise &&
 919             type != Crop &&
 920             type != BatchNormalization &&
 921             type != Copy) {
 922             check_result = false;
 923         }
 924     }, false);
 925
 926     return check_result;
 927 }
 928
 929 InferenceEngine::InferRequestInternal::Ptr
 930 MKLDNNExecNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
 931                                           InferenceEngine::OutputsDataMap networkOutputs) {
 932     if (graphs.size() > 1)  // streams uses special requests that are not connected to graphs
 933         return std::make_shared<MKLDNNGraphlessInferRequest>(networkInputs, networkOutputs);
 934     else
 935         return std::make_shared<MKLDNNInferRequest>(networkInputs, networkOutputs);
 936 }
 937
 938 MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network,
 939                                      const Config &cfg,
 940                                      const MKLDNNExtensionManager::Ptr& extMgr) : extensionManager(extMgr) {
 941     ICNNNetworkStats* pstats = nullptr;
 942     StatusCode s = network.getStats(&pstats, nullptr);
 943     // we are cloning network if we have statistics and we can transform network.
 944     auto clonedNetwork = cloneNet(network);
 945
 946     if (s == StatusCode::OK && pstats && !pstats->isEmpty()) {
 947         CNNNetworkInt8Normalizer cnnorm;
 948         cnnorm.NormalizeNetwork(*clonedNetwork, *pstats);
 949     }
 950
 951     bool ti_proc_ok = !NetPass::CombineRNNSeq(*clonedNetwork) ? NetPass::UnrollTI(*clonedNetwork) : true;
 952     ti_proc_ok &= NetPass::UnrollRNN_if(*clonedNetwork, [] (RNNCellBase rnn) -> bool {
 953         if (rnn.clip != 0.0f)
 954             return true;
 955         if ((rnn.cellType == RNNCellBase::GRU || rnn.cellType == RNNCellBase::GRU_LBR) &&
 956                 rnn.activations != std::vector<std::string> {"sigmoid", "tanh"})
 957             return true;
 958         if (rnn.cellType == RNNCellBase::LSTM &&
 959                 rnn.activations != std::vector<std::string> {"sigmoid", "tanh", "tanh"})
 960             return true;
 961         return false;
 962     });
 963     if (!ti_proc_ok)
 964         THROW_IE_EXCEPTION << "Plugin doesn't support Tensor Iterator in pure form. "
 965                               "None TI optimization pattern has been applied successfully";
 966
 967
 968     if (cfg.batchLimit > 1) {
 969         // check topology for applicability
 970         if (!CanProcessDynBatch(*clonedNetwork)) {
 971             THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: such topology cannot be compiled for dynamic batch!";
 972         }
 973     }
 974     // check whether any (affinity-related) envs are set and if user requested thread binding
 975     const bool bPinningRequested = !check_env_variables() && cfg.useThreadBinding;
 976     // general #threads logic
 977     const int env_threads = parallel_get_env_threads();
 978     // for streams need all (logical) cores, while single-stream case just physical cores (better for servers), as usual
 979     const int hw_cores = cfg.throughputStreams > 1 ? parallel_get_max_threads() : getNumberOfCPUCores();
 980     const int threads = cfg.threadsNum ? cfg.threadsNum : (env_threads ? env_threads : hw_cores);
 981     const int threads_per_stream = std::max(1, threads/cfg.throughputStreams);
 982
 983     // graph(s) initialization in taskExecutor threads (streams), in parallel (in case of streams)
 984     std::vector<Task::Ptr> tasks;
 985
 986     for (int n = 0; n < cfg.throughputStreams; n++) {
 987         MKLDNNGraph::Ptr _graph = std::make_shared<MKLDNNGraph>();
 988         graphs.push_back(_graph);
 989         auto task = std::make_shared<InferenceEngine::Task>([=, &cfg, &network]() {
 990             _graph->CreateArena(threads_per_stream);
 991
 992             if (bPinningRequested) {
 993                 _graph->CreateObserver(n, threads_per_stream);
 994             }
 995
 996             _graph->setConfig(cfg);
 997             _graph->CreateGraph(*clonedNetwork, extensionManager);
 998             if (cfg.throughputStreams > 1)  // for streams, each worker thread has it's own graph
 999                 MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph = _graph;
1000         });
1001         tasks.push_back(task);
1002     }
1003
1004     if (cfg.throughputStreams > 1) {
1005         // special executor with as many threads as requested #streams, each with it's own initialization task
1006         _taskExecutor = std::make_shared<MultiWorkerTaskExecutor>(tasks);
1007     } else {
1008         if (cfg.exclusiveAsyncRequests) {
1009             // special case when all InferRequests are muxed into a single queue
1010             ExecutorManager *executorManager = ExecutorManager::getInstance();
1011             _taskExecutor = executorManager->getExecutor(TargetDeviceInfo::name(TargetDevice::eCPU));
1012         }
1013         _taskExecutor->startTask(tasks[0]);
1014         Task::Status sts = tasks[0]->wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
1015     }
1016     for (auto t : tasks)
1017         t->checkException();
1018 }
1019
1020 void MKLDNNExecNetwork::setProperty(const std::map<std::string, std::string> &properties) {
1021     for (auto g : graphs)
1022         g->setProperty(properties);
1023 }
1024
1025 void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) {
1026     auto syncRequestImpl = CreateInferRequestImpl(_networkInputs, _networkOutputs);
1027     syncRequestImpl->setPointerToExecutableNetworkInternal(shared_from_this());
1028     auto asyncRequestImpl = std::make_shared<MKLDNNAsyncInferRequest>(syncRequestImpl, _taskExecutor,
1029                                                                       _taskSynchronizer, _callbackExecutor);
1030     asyncRequest.reset(new InferRequestBase<MKLDNNAsyncInferRequest>(asyncRequestImpl),
1031                        [](IInferRequest *p) { p->Release(); });
1032
1033     asyncRequestImpl->SetPointerToPublicInterface(asyncRequest);
1034
1035     if (graphs.size() == 1) {  // single-stream (legacy/hetero) case - single graph for all requests
1036         auto mkldnnSyncRequest = dynamic_cast<MKLDNNInferRequest *>(syncRequestImpl.get());
1037         if (!mkldnnSyncRequest)
1038             THROW_IE_EXCEPTION << " Cannot get mkldnn sync request.";
1039         mkldnnSyncRequest->SetGraph(graphs[0]);
1040     }
1041 }
1042
1043 void MKLDNNExecNetwork::GetExecGraphInfo(InferenceEngine::ICNNNetwork::Ptr &graphPtr) {
1044     graphPtr = graphs[0]->dump();
1045 }