1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
9 #include <unordered_set>
12 #include <unordered_map>
14 #include "details/caseless.hpp"
16 #include "mkldnn_graph.h"
17 #include "mkldnn_graph_optimizer.h"
19 #include <nodes/mkldnn_input_node.h>
20 #include <nodes/mkldnn_reorder_node.h>
21 #include <nodes/mkldnn_depthwise_node.h>
22 #include <nodes/mkldnn_conv_node.h>
24 #include "mkldnn_extension_utils.h"
25 #include "mkldnn_extension_mngr.h"
26 #include "mkldnn/omp_manager.h"
27 #include <graph_tools.hpp>
28 #include <cpp_interfaces/ie_executor_manager.hpp>
29 #include "ie_algorithm.hpp"
30 #include "memory_solver.hpp"
31 #include "mkldnn_infer_request.h"
32 #include "mkldnn_async_infer_request.h"
33 #include <blob_factory.hpp>
34 #include <ie_util_internal.hpp>
36 #include <details/ie_cnn_network_tools.h>
38 #include <mkldnn_graph_dumper.h>
40 #include <data_stats.h>
41 #include "cnn_network_int8_normalizer.hpp"
42 #include "ie_memcpy.h"
44 #define XBYAK_NO_OP_NAMES
45 #define XBYAK_UNDEF_JNL
46 #include "../../thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_util.h"
48 #include "cnn_network_stats_impl.hpp"
50 #include "utils/blob_dump.h"
52 /*****************************************************
54  * - BLOB_DUMP_PATH : Set to the name of an existing folder;
55  *   intermediate blobs will be dumped into it
56 * - PRINT_GRAPH_INFO : Define it to enable printing
57 * additional information to std output.
59 *****************************************************/
60 // #define BLOB_DUMP_PATH "mkldnn_dump"
61 // #define PRINT_GRAPH_INFO
62 // #define DUMP_AS_TEXT
65 # define DUMP_DIR BLOB_DUMP_PATH
66 # define ENABLE_DUMP(_x) { _x ;}
69 # define ENABLE_DUMP(_x)
72 using namespace mkldnn;
73 using namespace MKLDNNPlugin;
74 using namespace MKLDNNPlugin::cpu;
75 using namespace InferenceEngine;
76 using namespace InferenceEngine::details;
// Top-level graph construction: mirrors the IE network into MKLDNN nodes/edges.
// NOTE(review): this listing is sampled — lines 79-81 and 83+ (state setup,
// later initialization calls, closing brace) are not visible here.
78 void MKLDNNGraph::CreateGraph(const ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) {
82     Replicate(network, extMgr);
// Replicates the IE CNNNetwork topology into this graph:
//  1) normalizes input-layer precisions,
//  2) creates an MKLDNNNode per layer (in topological order) and edges between them,
//  3) appends synthetic "Output" nodes for network outputs,
//  4) records input nodes and loads mean images for preprocessing.
87 void MKLDNNGraph::Replicate(const ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) {
89     network.getInputsInfo(inputs);
91         THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: No inputs for the topology";
94     // The input layer precision has to be equal to the InputData precision
95     for (const auto& input : inputs) {
96         auto inputLayer = input.second->getInputData()->getCreatorLayer().lock();
97         if (inputLayer) inputLayer->precision = inputLayer->outData[0]->precision;
100     std::unordered_map<CNNLayerPtr, MKLDNNNodePtr> layer2node;
// Maps a DataPtr back to its index among the creator layer's outputs (= parent port).
102     auto _parent_port = [] (const DataPtr &data) -> int {
103         auto parent = data->creatorLayer.lock();
// NOTE(review): loop condition is `parent->outData.size()` — it never tests `i`,
// so it is "true" whenever outData is non-empty and relies on the match below to
// exit. If `data` is ever absent from outData this reads out of bounds.
// Looks like it should be `i < parent->outData.size()` — confirm against full source.
104         for (int i = 0; parent->outData.size(); i++)
105             if (data == parent->outData[i])
110     // Replicate All Nodes in topological order
111     for (const auto layer : CNNNetSortTopologically(network)) {
112         CNNLayerPtr _layer = layer;
// Memory layers with index=="1" are rewritten as synthetic "MemoryInput" layers
// so they get a dedicated node type; params/outData are carried over.
113         if (layer->type == "Memory" && layer->GetParamAsString("index") == "1") {
114             auto memoryId = layer->GetParamAsString("id");
115             _layer.reset(new CNNLayer({layer->name + "/id=" + memoryId, "MemoryInput", layer->precision}));
116             _layer->params = layer->params;
117             _layer->outData = layer->outData;
120         const MKLDNNNodePtr node(MKLDNNNode::CreateNode(_layer, getEngine(), extMgr));
121         graphNodes.push_back(node);
122         layer2node[layer] = node;
// Connect this node to each already-created parent via an edge carrying port indices.
124         for (int port = 0; port < layer->insData.size(); port++) {
125             auto data = layer->insData[port].lock();
126             auto parent_layer = data->creatorLayer.lock();
127             if (!parent_layer) continue;  // no parent means that it is input data node (or memory/const layer)
129             auto parent_node = layer2node[parent_layer];
131             MKLDNNEdgePtr edge(new MKLDNNEdge(parent_node, node, _parent_port(data), port));
133             graphEdges.push_back(edge);
// For every network output, append a synthetic "Output" node named "out_<name>"
// (PullOutputData later strips this 4-char prefix with substr(4)).
137     std::map<std::string, DataPtr> outputs;
138     network.getOutputsInfo(outputs);
140     for (const auto &output : outputs) {
141         const auto data = output.second;
143         auto parent_layer = data->creatorLayer.lock();
144         auto parent_node = layer2node[parent_layer];
146         CNNLayerPtr layer(new CNNLayer({"out_" + output.first, "Output", data->precision}));
147         layer->insData.push_back(data);
149         const MKLDNNNodePtr node(MKLDNNNode::CreateNode(layer, getEngine(), extMgr));
151         MKLDNNEdgePtr edge(new MKLDNNEdge(parent_node, node, _parent_port(data), 0));
153         graphEdges.push_back(edge);
155         graphNodes.push_back(node);
156         outputNodes.push_back(node);
157         layer2node[layer] = node;
160     // Replicate input nodes
161     for (const auto& input : inputs) {
162         auto inputLayer = input.second->getInputData()->getCreatorLayer().lock();
163         inputNodes[input.first] = layer2node[inputLayer];
165         // Loading mean images
166         MKLDNNDims outDims(inputNodes[input.first]->getChildEdgeAt(0)->getDims());
// NOTE(review): `input` already comes from `inputs`, so this find() is always true.
167         if (inputs.find(input.first) != inputs.end()) {
168             InputInfo::Ptr ii = inputs[input.first];
169             if (ii && ii->getPreProcess().getNumberOfChannels()) {
170                 _meanImages[input.first].Load(outDims, ii);
// Runs graph-level optimizations, fixes optimal primitive descriptors,
// records original layer names (for perf reporting) before cleanup,
// optionally dumps the graph to DOT, and pre-executes constant nodes.
176 void MKLDNNGraph::InitGraph() {
178     MKLDNNGraphOptimizer optimizer;
179     optimizer.ApplyCommonGraphOptimizations(*this);
184     for (auto &node : graphNodes) {
185         node->initOptimalPrimitiveDescriptor();
189     optimizer.ApplyImplSpecificGraphOptimizations(*this);
197     // Do it before cleanup. Because it will lose original layers information
198     for (auto &graphNode : graphNodes) {
199         auto nodeType = graphNode->getType();
200         if (nodeType == Reorder || nodeType == Output) continue;
202         graphNode->addOriginalLayer(graphNode->getCnnLayer());
// Fused/merged sub-nodes contribute their original layer names too.
203         if (graphNode->getFusedWith().size() || graphNode->getMergeWith().size()) {
204             // Original layer names
205             std::vector<MKLDNNNodePtr> internal = graphNode->getFusedWith();
206             auto &merged = graphNode->getMergeWith();
207             internal.insert(internal.end(), merged.begin(), merged.end());
209             for (auto &sub_node : internal) {
210                 graphNode->addOriginalLayer(sub_node->getCnnLayer());
214     if (!config.dumpToDot.empty())
215         dumpToDotFile(config.dumpToDot + "_init.dot");
217     for (auto &graphNode : graphNodes) {
218         graphNode->cleanup();
// Debug-only dump of per-node in/out precision and layout to stdout.
221 #if !defined(NDEBUG) && defined(PRINT_GRAPH_INFO)
222     for (auto &graphNode : graphNodes) {
223         std::cout << "name: " << graphNode->getName() << " [ ";
224         if (graphNode->parentEdges.size() > 0) {
225             auto prnt_out_desc = graphNode->parentEdges[0].lock()->getOutputDesc();
226             std::cout << "in: " << prnt_out_desc.getPrecision().name()
227                       << "/l=" << prnt_out_desc.getLayout()
230         if (graphNode->childEdges.size() > 0) {
231             auto chld_in_desc = graphNode->childEdges[0].lock()->getInputDesc();
232             std::cout << "out: " << chld_in_desc.getPrecision().name()
233                       << "/l=" << chld_in_desc.getLayout();
235         std::cout << " ]" << std::endl;
// Pre-execution pass with an eager stream.
239     mkldnn::stream stream = mkldnn::stream(stream::kind::eager);
240     for (auto &graphNode : graphNodes) {
// NOTE(review): line 242 is omitted from this listing — presumably `continue;`,
// so only *constant* nodes are executed here (constant folding at load time).
// Confirm against the full source.
241         if (!graphNode->isConstant())
243         graphNode->execute(stream);
// Per-node initialization: marks inputs that need mean-image preprocessing,
// then computes supported descriptors and selects the optimal primitive descriptor.
247 void MKLDNNGraph::InitNodes() {
248     for (auto &node : graphNodes) {
// Input nodes with a registered mean image must keep their data mutable.
249         if (node->getType() == Input && _meanImages.find(node->getName()) != _meanImages.end()) {
250             auto *inputNode = dynamic_cast<MKLDNNInputNode *>(node.get());
252                 inputNode->withMeanImage();
254         node->getSupportedDescriptors();
256         node->initSupportedPrimitiveDescriptors();
// Second pass: selection needs all nodes' supported descriptors to exist first.
259     for (auto &node : graphNodes) {
260         node->selectOptimalPrimitiveDescriptor();
// For every edge whose producer/consumer disagree on precision or memory format,
// splices in a Reorder node (parent -> reorder -> child) and removes the old edge.
264 void MKLDNNGraph::InitEdges() {
// Builds a human-readable "<in>_<out>" suffix describing what the reorder converts.
265     auto reorderArgs = [](InferenceEngine::TensorDesc parentDesc, InferenceEngine::TensorDesc childDesc) {
266         std::string inArgs, outArgs;
267         if (parentDesc.getPrecision() != childDesc.getPrecision()) {
268             inArgs += (inArgs.empty() ? "" : "_") + std::string(parentDesc.getPrecision().name());
269             outArgs += (outArgs.empty() ? "" : "_") + std::string(childDesc.getPrecision().name());
271         if (MKLDNNMemoryDesc(parentDesc).getFormat() != MKLDNNMemoryDesc(childDesc).getFormat()) {
272             inArgs += (inArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(parentDesc).getFormat());
273             outArgs += (outArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(childDesc).getFormat());
275         return inArgs + "_" + outArgs;
// Cache the original count: new edges appended below must not be re-examined.
277     size_t numberOfEdges = graphEdges.size();
278     for (auto i = 0; i < numberOfEdges; i++) {
279         if (graphEdges[i]->needReorder()) {
280             auto &edge = graphEdges[i];
281             std::string layerName = edge->getParent()->getName() + "_" +
282                                     reorderArgs(edge->getInputDesc(), edge->getOutputDesc()) + "_" +
283                                     edge->getChild()->getName();
284             CNNLayerPtr layer(new CNNLayer({layerName,
286                                             edge->getInputDesc().getPrecision()}));
287             MKLDNNNodePtr newReorder(new MKLDNNReorderNode(layer, getEngine()));
288             auto *reorderPtr = dynamic_cast<MKLDNNReorderNode *>(newReorder.get());
290                 reorderPtr->setDescs(edge->getInputDesc(), edge->getOutputDesc());
293             auto oIndex = edge->getOutputNum();
294             auto iIndex = edge->getInputNum();
295             if (iIndex < 0 || oIndex < 0)
296                 THROW_IE_EXCEPTION << "Cannot create reorder for nodes: "
297                                    << edge->getParent()->getName() << " and "
298                                    << edge->getChild()->getName() << ".";
// Two new edges replace the old one, preserving the original port numbers.
302             MKLDNNEdgePtr beforeNode(new MKLDNNEdge(edge->getParent(), newReorder, iIndex, 0));
303             MKLDNNEdgePtr afterNode(new MKLDNNEdge(newReorder, edge->getChild(), 0, oIndex));
305             // Add edge for beforeNode
306             beforeNode->getChild()->parentEdges.push_back(beforeNode);
307             edge->getParent()->childEdges.push_back(beforeNode);
309             // Add edge for afterNode
310             afterNode->getParent()->childEdges.push_back(afterNode);
311             edge->getChild()->parentEdges.push_back(afterNode);
313             newReorder->getSupportedDescriptors();
314             newReorder->initSupportedPrimitiveDescriptors();
315             newReorder->selectOptimalPrimitiveDescriptor();
317             graphEdges.push_back(beforeNode);
318             graphEdges.push_back(afterNode);
320             graphNodes.push_back(newReorder);
// NOTE(review): erasing index i shifts the next edge into slot i, but the loop
// then increments i — the lines compensating for this (e.g. `i--`/`numberOfEdges--`)
// are not visible in this sampled listing; confirm against the full source.
321             graphEdges.erase(graphEdges.begin() + i);
// True when the edge leaves the constant subgraph: constant producer, non-constant consumer.
328 static inline bool isConstOutput(MKLDNNEdgePtr edge) {
329     return edge->getParent()->isConstant() && !edge->getChild()->isConstant();
// Memory-reuse allocation:
//  1) groups edges into clusters that alias the same underlying blob,
//  2) builds a lifetime/size Box per cluster and runs MemorySolver,
//  3) creates one aligned workspace blob and hands out offsets to edges
//     with status NeedAllocation.
// ("claster" spelling is historical in this file and kept as-is.)
332 void MKLDNNGraph::AllocateWithReuse() {
333     std::vector<std::vector<MKLDNNEdgePtr>> edge_clasters;
335     // detect edge clusters which are view on one.
336     for (auto &edge : graphEdges) {
// A NotAllocated edge is a view over its shared edge; otherwise it stands alone.
337         MKLDNNEdgePtr par = (edge->getStatus() == MKLDNNEdge::Status::NotAllocated)
338                             ? edge->getSharedEdge()
342         for (auto &claster : edge_clasters) {
343             for (auto &element : claster) {
344                 if (element == par) {
345                     if (std::find(claster.begin(), claster.end(), edge) == claster.end())
346                         claster.push_back(edge);
// No cluster contained the parent: start a new one holding both.
353             edge_clasters.push_back({par, edge});
356         for (auto &claster : edge_clasters) {
357             for (auto &element : claster) {
358                 if (element == edge) {
365             edge_clasters.push_back({edge});
369     //======= WA. getSharedEdge() returns not identical edges ============
370     // Will try to merge clasters with matched edges
371     for (auto &edge : graphEdges) {
372         std::vector<decltype(&edge_clasters[0])> to_merge;
374         for (auto &claster : edge_clasters)
375             if (std::find(claster.begin(), claster.end(), edge) != claster.end())
376                 to_merge.push_back(&claster);
378         if (to_merge.size() > 1) {
// Fold all clusters containing this edge into the first one, emptying the rest.
380             auto base_classter = to_merge[0];
381             for (int i = 1; i < to_merge.size(); i++) {
382                 base_classter->insert(base_classter->end(),
383                                       to_merge[i]->begin(), to_merge[i]->end());
384                 to_merge[i]->clear();
387             // remove duplicates in merged claster
388             std::sort(base_classter->begin(), base_classter->end());
389             base_classter->erase(std::unique(base_classter->begin(), base_classter->end()),
390                                  base_classter->end() );
392     // remove empty clasters
393     edge_clasters.erase(std::remove_if(edge_clasters.begin(), edge_clasters.end(),
394                         [] ( std::vector<MKLDNNEdgePtr> &cls) { return cls.empty(); }),
395                         edge_clasters.end());
398     //======= End of WA ============
400     const int64_t alignment = 32;  // 32 bytes
402     std::vector<MemorySolver::Box> boxes(edge_clasters.size());
403     for (int i = 0; i < edge_clasters.size(); i++) {
404         MemorySolver::Box &box = boxes[i];
405         box = { std::numeric_limits<int>::max(), 0, 0, i };
// Box lifetime = union of producer/consumer exec indexes; size = max edge footprint.
406         for (auto &edge : edge_clasters[i]) {
407             int e_start = edge->getParent()->execIndex;
408             int e_finish = edge->getChild()->execIndex;
410             const BlockingDesc block_desk = edge->getDesc().getBlockingDesc();
// Footprint = padding offset + 1 + sum over dims of (dim-1)*stride, in elements.
412             int64_t e_size = block_desk.getOffsetPadding() + 1;  // size in bytes (from begin of data to last element)
413             for (int j = 0; j < block_desk.getBlockDims().size(); j++)
414                 e_size += (block_desk.getBlockDims()[j] - 1) * block_desk.getStrides()[j];
// BIN precision is packed; its element "size" multiplier is 1.
416             e_size *= edge->getDesc().getPrecision() == Precision::BIN ? 1 : edge->getDesc().getPrecision().size();
418             box.start = std::min(e_start, box.start);
419             box.finish = std::max(e_finish, box.finish);
420             box.size = std::max(e_size, box.size);
423         // Constant data are filled once on load.
424         // So we need it untouchable during all execution time
425         // -1 is a place holder for a max timestamp.
426         bool isConst = false, isOutput = false, isInput = false;
427         for (auto &edge : edge_clasters[i]) {
428             isConst  |= isConstOutput(edge);
429             isOutput |= edge->getChild()->getType() == Output;
430             isInput  |= edge->getParent()->getType() == Input;
432             // WA. MemoryOutput will keep data in that edge
433             // So need to make it immortal..
434             isConst |= edge->getParent()->getType() == MemoryInput;
// Inputs/consts live from t=0; outputs/consts live forever (finish == -1 sentinel).
437         if (isInput  | isConst) box.start = 0;
438         if (isOutput | isConst) box.finish = -1;
// Solver works in alignment-sized units; convert back to bytes after solving.
440         box.size = div_up(box.size, alignment);
443     MemorySolver memSolver(boxes);
444     size_t total_size = static_cast<size_t>(memSolver.solve()) * alignment;
446     memWorkspace = std::make_shared<MKLDNNMemory>(eng);
447     memWorkspace->Create(MKLDNNMemoryDesc(TensorDesc(Precision::I8, {total_size}, Layout::C)));
448     auto* workspace_ptr = static_cast<int8_t*>(memWorkspace->GetData());
450     for (int i = 0; i < edge_clasters.size(); i++) {
452         for (auto &edge : edge_clasters[i]) {
453             if (edge->getStatus() == MKLDNNEdge::Status::NeedAllocation) {
454                 int64_t offset = memSolver.getOffset(i);
455                 // !! Fallback to individual memory allocation !!
456                 // if you like to check infer without reuse just call this function without arguments.
457                 edge->allocate(workspace_ptr + offset * alignment);  // alignment in byte
// Exactly one edge per cluster may own the real allocation.
461         IE_ASSERT(count == 1);
// Edge memory allocation driver: init edge statuses, allocate real blobs
// (via AllocateWithReuse — call not visible in this sampled listing),
// resolve view/in-place edges, then validate everything.
465 void MKLDNNGraph::Allocate() {
466     // resolve edges. Define which will be a view on others
467     // NeedAllocation - real blob
468     // NotAllocated - view on other blob, peer or in-place
469     for (auto& edge : graphEdges) edge->init();
471     // Allocate memory space for all edges marked with NeedAllocation
474     // Resolve all other edges with status NotAllocated or in-place
475     for (auto& node : graphNodes) node->resolveNotAllocatedEdges();
477     // Check all getters. Should work.
478     for (auto& edge : graphEdges) edge->validate();
// Instantiates the mkldnn primitive for every node (memory must already be allocated).
481 void MKLDNNGraph::CreatePrimitives() {
482     for (auto& node : graphNodes) {
483         node->createPrimitive();
// Copies/converts an external input blob into the graph's input edge memory
// and applies mean-image subtraction when configured for this input.
// Throws if the graph is not ready or `name` is not a known network input.
487 void MKLDNNGraph::PushInputData(const std::string& name, const InferenceEngine::Blob::Ptr &in) {
488     if (!IsReady()) THROW_IE_EXCEPTION<< "Wrong state. Topology not ready.";
490     auto input = inputNodes.find(name);
491     if (input != inputNodes.end()) {
492         MKLDNNDims outDims = input->second->getChildEdgeAt(0)->getDims();
494         const void *ext_data_ptr = in->cbuffer();
495         void *inter_data_ptr = input->second->getChildEdgeAt(0)->getMemory().GetData();
// If the user blob does not already alias the internal memory, convert+copy it in.
497         if (ext_data_ptr != inter_data_ptr) {
498             auto l = in->getTensorDesc().getLayout();
// CHW blob feeding a 4D edge: layout adjustment (the adjusted value, line ~500,
// is omitted from this sampled listing — presumably promoted to NCHW; confirm).
499             if (l == CHW && input->second->getChildEdgeAt(0)->getDims().ndims() == 4)
502             input->second->getChildEdgeAt(0)->getMemory().SetData(
503                     MKLDNNExtensionUtils::IEPrecisionToDataType(in->getTensorDesc().getPrecision()),
504                     MKLDNNMemory::Convert(l), ext_data_ptr, in->byteSize(), false);
507         // todo: make sure 'name' exists in this map...
508         if (_meanImages.find(name) != _meanImages.end()) {
// Mean subtraction is supported for FP32 inputs only.
509             if (in->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32) {
510                 _meanImages[name].Subtract(outDims, reinterpret_cast<float *>(inter_data_ptr), in->getTensorDesc().getLayout());
512                 THROW_IE_EXCEPTION << "Mean image of type " << in->getTensorDesc().getPrecision().name() << " is unsupported";
516         THROW_IE_EXCEPTION << "Input blob for infer '" << name << "' doesn't correspond to input in network";
// Copies result data from each Output node's parent edge into the user's BlobMap,
// creating/resizing blobs where needed and honoring the dynamic-batch limit.
520 void MKLDNNGraph::PullOutputData(BlobMap &out) {
522         THROW_IE_EXCEPTION << "Wrong state. Topology not ready.";
524     for (MKLDNNNodePtr &node : outputNodes) {
525         // remove out_ from node name
// Output nodes are named "out_<name>" by Replicate; substr(4) strips that prefix.
526         std::string name = node->getName().substr(4);
527         const MKLDNNMemory& intr_blob = node->getParentEdgeAt(0)->getMemory();
528         if (out.find(name) == out.end()) {
529             // TODO: Create blob from MemoryDesc
// NOTE(review): this branch wraps internal memory in a new FP32 blob; the line
// storing it into `out` is omitted from this sampled listing.
530             Blob::Ptr outBlob = make_shared_blob<float>({Precision::FP32, node->getParentEdgeAt(0)->getDims().ToSizeVector(),
531                                                          TensorDesc::getLayoutByDims(node->getParentEdgeAt(0)->getDims().ToSizeVector())},
532                                                         reinterpret_cast<float*>(intr_blob.GetData()));
536         Blob::Ptr &ext_blob = out[name];
538         // TODO: Why we allow allocation of output memory inside Infer call??
539         // Suggestion is to disable this behaviour
540         if (ext_blob->buffer() == nullptr) {
541             SizeVector dims = node->getParentEdgeAt(0)->getDims().ToSizeVector();
542             std::reverse(dims.begin(), dims.end());  // Blobs dims are in reverse order (legacy of OpenVX :-( )
543             ext_blob->Resize(dims);
544             ext_blob->allocate();
547         if (ext_blob->byteSize() != intr_blob.GetSize())
548             THROW_IE_EXCEPTION << "Output blob size is not equal network output size ("
549                                << ext_blob->size() << "!=" << intr_blob.GetSize()/sizeof(float) << ").";
551         void *ext_blob_ptr = ext_blob->buffer();
552         void *intr_blob_ptr = intr_blob.GetData();
554         // That is the same memory. No need to copy
555         if (ext_blob_ptr == intr_blob_ptr) continue;
// Dynamic batch: copy only the first MB_to_process batches' worth of bytes.
557         int MB = intr_blob.GetDims()[0];
558         int MB_to_process = node->batchToProcess();
559         // TODO: Should we support InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_LIMIT???
560         if (config.batchLimit)
561             MB_to_process = std::min<int>(config.batchLimit, MB_to_process);
562         size_t size_to_copy = intr_blob.GetSize() * MB_to_process / MB;
564         ie_memcpy(ext_blob_ptr, ext_blob->byteSize(), intr_blob_ptr, size_to_copy);
// Executes all non-constant nodes in order with an eager stream.
// `batch` (when >= 0, presumably) sets the dynamic batch limit per node.
// ENABLE_DUMP expands to blob dumps only when BLOB_DUMP_PATH is defined.
568 void MKLDNNGraph::Infer(int batch) {
570         THROW_IE_EXCEPTION << "Wrong state. Topology is not ready.";
573     mkldnn::stream stream = mkldnn::stream(stream::kind::eager);
574     for (int i = 0; i < graphNodes.size(); i++) {
578             graphNodes[i]->setDynamicBatchLim(batch);
580         ENABLE_DUMP(do_before(DUMP_DIR, graphNodes[i]));
// Constant nodes were already executed at load time (see InitGraph).
582         if (!graphNodes[i]->isConstant()) {
583             IE_PROFILING_AUTO_SCOPE_TASK(graphNodes[i]->profilingTask)
584             graphNodes[i]->execute(stream);
587         ENABLE_DUMP(do_after(DUMP_DIR, graphNodes[i]));
// DFS helper for SortTopologically: `temporary` marks the current path
// (a revisit means a cycle), `permanent` marks finished nodes.
// Finished nodes are prepended, yielding topological order in `sortedNodes`.
591 void MKLDNNGraph::VisitNode(MKLDNNNodePtr node, std::vector<MKLDNNNodePtr>& sortedNodes) {
592     if (node->temporary) {
596     if (node->permanent) {
600     node->temporary = true;
602     for (size_t i = 0; i < node->getChildEdges().size(); i++) {
603         VisitNode(node->getChildEdgeAt(i)->getChild(), sortedNodes);
606     node->permanent = true;
607     node->temporary = false;
// Insert-at-front is O(n) per node; acceptable for graph-sized inputs here.
609     sortedNodes.insert(sortedNodes.begin(), node);
// Topologically sorts graphNodes (assigning execIndex), then reorders each node's
// parent/child edge vectors so that index == port number (backward-compat WA).
612 void MKLDNNGraph::SortTopologically() {
613     std::vector<MKLDNNNodePtr> unsorted;
614     std::vector<MKLDNNNodePtr> sorted;
616     for (int i = 0; i < graphNodes.size(); i++) {
617         MKLDNNNodePtr node = graphNodes[i];
619         node->permanent = false;
620         node->temporary = false;
622         unsorted.push_back(node);
625     while (!unsorted.empty()) {
626         MKLDNNNodePtr node = unsorted.at(0);
627         unsorted.erase(unsorted.begin());
629         VisitNode(node, sorted);
632     for (int i = 0; i < sorted.size(); i++) sorted[i]->execIndex = i;
634     graphNodes.erase(graphNodes.begin(), graphNodes.end());
635     graphNodes.assign(sorted.begin(), sorted.end());
637     // TODO: Sort in/out edges by port index because of backward compatibility
638     // A lot of plugin logic are build on top of assumption that index in
639     // vector childEdges/parentEdges is port number. But that is not
640     // truth anymore. But to keep old logic correct need to simulate ordering.
642     // Make first N (N == port_num) edge indexes are matched with port index
643     for (auto &node : graphNodes) {
645             int port_num = node->inDims.size();
646             std::vector<MKLDNNEdgePtr> res(port_num);
648             for (int i = 0; i < node->parentEdges.size(); i++) {
649                 auto edge = node->getParentEdgeAt(i);
650                 int port = edge->getOutputNum();
// Lines placing `edge` into res[port] (and appending extras) are omitted
// from this sampled listing.
656             node->parentEdges = {res.begin(), res.end()};
659             int port_num = node->outDims.size();
660             std::vector<MKLDNNEdgePtr> res(port_num);
662             for (int i = 0; i < node->childEdges.size(); i++) {
663                 auto edge = node->getChildEdgeAt(i);
664                 int port = edge->getInputNum();
670             node->childEdges = {res.begin(), res.end()};
// Fills `perfMap` with per-node profiling info (exec index, avg time, impl type),
// recursing into fused/merged sub-nodes. Optionally dumps a perf DOT file.
675 void MKLDNNGraph::GetPerfData(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const {
677     std::function<void(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &, const MKLDNNNodePtr&)>
678             getPerfMapFor = [&](std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap, const MKLDNNNodePtr& node) {
679         InferenceEngine::InferenceEngineProfileInfo &pc = perfMap[node->getName()];
// `i` is captured from the enclosing scope (declaration not visible in this listing).
680         pc.execution_index = i++;
681         // TODO: Why time counter is signed?
682         pc.cpu_uSec = pc.realTime_uSec = (long long) node->PerfCounter().avg();
683         pc.status = pc.cpu_uSec > 0 ? InferenceEngine::InferenceEngineProfileInfo::EXECUTED
684                                     : InferenceEngine::InferenceEngineProfileInfo::NOT_RUN;
// copy() does not null-terminate; fixed-size char arrays are filled up to typeLen.
685         std::string pdType = node->getPrimitiveDescriptorType();
686         size_t typeLen = sizeof(pc.exec_type) / sizeof(pc.exec_type[0]);
687         pdType.copy(pc.exec_type, typeLen, 0);
688         size_t layerTypeLen = sizeof(pc.layer_type) / sizeof(pc.layer_type[0]);
689         node->typeStr.copy(pc.layer_type, layerTypeLen, 0);
691         for (auto& fusedNode : node->fusedWith) {
692             getPerfMapFor(perfMap, fusedNode);
695         for (auto& mergedWith : node->mergedWith) {
696             getPerfMapFor(perfMap, mergedWith);
// NOTE(review): iteration starts at index 1 — node 0 is skipped; the reason
// is not visible in this sampled listing (confirm against full source).
700     for (int i = 1; i < graphNodes.size(); i++) {
701         getPerfMapFor(perfMap, graphNodes[i]);
704     if (!config.dumpToDot.empty()) dumpToDotFile(config.dumpToDot + "_perf.dot");
// Trivial config accessors (bodies partially omitted from this sampled listing).
// setConfig stores the given Config; setProperty parses a key/value property map.
707 void MKLDNNGraph::setConfig(const Config &cfg) {
711 void MKLDNNGraph::setProperty(const std::map<std::string, std::string>& properties) {
712     config.readProperties(properties);
// Returns the current configuration (by value).
715 Config MKLDNNGraph::getProperty() {
// Exposes each non-constant input node's first child-edge blob under its input name.
719 void MKLDNNGraph::getInputBlobs(InferenceEngine::BlobMap &resp) {
720     for (auto &it : inputNodes) {
721         MKLDNNInputNode* node = dynamic_cast<MKLDNNInputNode*>(it.second.get());
// Constant inputs (and failed casts) are skipped.
722         if (!node || node->isConstant())
724         resp[it.first] = node->getChildEdgeAt(0)->getBlob();
// Exposes each output node's parent-edge blob, keyed by the node name with the
// synthetic "out_" prefix stripped (substr(4)).
728 void MKLDNNGraph::getOutputBlobs(InferenceEngine::BlobMap &resp) {
729     for (auto &it : outputNodes) {
730         std::string name = it->getName().substr(4);
731         resp[name] = it->getParentEdgeAt(0)->getBlob();
// Detaches `node` from the graph by removing its edges and wiring each parent
// directly to each child with a new edge preserving the original port numbers.
735 void MKLDNNGraph::DropNode(const MKLDNNNodePtr &node) {
// Erases `edge` from the graph's edge list (loop body lines omitted in this listing).
736     auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
737         auto& edges = graph.GetEdges();
738         for (auto it = edges.begin(); it != edges.end(); it++) {
// Copy the weak-edge lists up front: removal below mutates the originals.
746     auto childs = node->childEdges;
747     auto parents = node->parentEdges;
749     for (size_t i = 0; i < parents.size(); i++) {
750         auto p_edge = parents[i].lock();
751         if (!p_edge) continue;
752         auto parent = p_edge->getParent();
753         if (!parent) continue;
755         for (size_t j = 0; j < childs.size(); j++) {
756             if (!childs[j].lock())
758             auto child = childs[j].lock()->getChild();
// Remember port numbers from both removed edges, then drop them.
762             MKLDNNEdgePtr &remEdge = p_edge;
765                 inNum = remEdge->getInputNum();
767                 removeEdge(*this, remEdge);
769             remEdge = childs[j].lock();
772                 outNum = remEdge->getOutputNum();
774                 removeEdge(*this, remEdge);
776             MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, child, inNum, outNum));
777             graphEdges.push_back(newEdge);
778             parent->addEdge(newEdge);
// Erases all nodes flagged as dropped (erase returns the next valid iterator).
783 void MKLDNNGraph::RemoveDroppedNodes() {
784     auto& nodes = this->GetNodes();
786     auto it = nodes.begin();
788     while (it != nodes.end()) {
789         if ((*it)->isDropped()) {
790             it = nodes.erase(it);
// Erases all edges flagged as dropped (mirror of RemoveDroppedNodes).
797 void MKLDNNGraph::RemoveDroppedEdges() {
798     auto& edges = this->GetEdges();
800     auto it = edges.begin();
802     while (it != edges.end()) {
803         if ((*it)->isDropped()) {
804             it = edges.erase(it);
// Writes the graph in Graphviz DOT format to `file`; throws if the file
// cannot be opened (stream declaration omitted from this sampled listing).
811 void MKLDNNGraph::dumpToDotFile(std::string file) const {
814     if (!dot.is_open()) THROW_IE_EXCEPTION << "CPU Plugin cannot create dot file " << file << ".";
816     dump_graph_as_dot(*this, dot);
// Debug helper: dumps every input blob of `node` to
// "<dir>/#<execIndex>_<sanitizedName>_in<i>.ieb" before the node executes.
// BIN-precision inputs are skipped (line after the check omitted in this listing).
820 void MKLDNNGraph::do_before(const std::string &dir, const MKLDNNNodePtr &node) {
821     auto exec_order = std::to_string(node->execIndex);
// Sanitize the node name for use in a file name.
822     std::string nodeName = node->name;
823     std::replace(nodeName.begin(), nodeName.end(), '\\', '_');
824     std::replace(nodeName.begin(), nodeName.end(), '/', '_');
825     std::replace(nodeName.begin(), nodeName.end(), ' ', '_');
826     std::replace(nodeName.begin(), nodeName.end(), ':', '_');
828     auto num_ports = node->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size();
829     for (size_t i = 0; i < num_ports; i++) {
830         auto prEdge = node->getParentEdgeAt(i);
831         auto pr = prEdge->getParent();
833         auto dump_file = dir + "/#" + exec_order + "_" + nodeName + "_in" + std::to_string(i) + ".ieb";
834         TensorDesc desc = prEdge->getDesc();
835         if (desc.getPrecision() == Precision::BIN)
837         Blob::Ptr blob = make_blob_with_precision(desc, prEdge->getMemoryPtr()->GetData());
839         BlobDumper dumper(blob);
840         if (pr->ext_scales) dumper.withScales(pr->ext_scales);
// DUMP_AS_TEXT selects between text and binary dump (the #ifdef lines are
// omitted from this sampled listing).
842         dumper.dumpAsTxt(dump_file);
844         dumper.dump(dump_file);
// Debug helper: dumps every output blob of `node` to
// "<dir>/#<execIndex>_<sanitizedName>_out<i>.ieb" after the node executes.
// Mirrors do_before; BIN-precision outputs are skipped.
849 void MKLDNNGraph::do_after(const std::string &dir, const MKLDNNNodePtr &node) {
850     auto exec_order = std::to_string(node->execIndex);
851     auto nodeName = node->name;
852     std::replace(nodeName.begin(), nodeName.end(), '\\', '_');
853     std::replace(nodeName.begin(), nodeName.end(), '/', '_');
854     std::replace(nodeName.begin(), nodeName.end(), ' ', '_');
855     std::replace(nodeName.begin(), nodeName.end(), ':', '_');
857     auto num_ports = node->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size();
858     for (size_t i = 0; i < num_ports; i++) {
859         auto childEdge = node->getChildEdgeAt(i);
861         auto dump_file = dir + "/#" + exec_order + "_" + nodeName + "_out" + std::to_string(i) + ".ieb";
862         TensorDesc desc = childEdge->getDesc();
863         if (desc.getPrecision() == Precision::BIN)
865         Blob::Ptr blob = make_blob_with_precision(desc, childEdge->getMemoryPtr()->GetData());
867         BlobDumper dumper(blob);
868         if (node->ext_scales) dumper.withScales(node->ext_scales);
// DUMP_AS_TEXT selects text vs binary dump (#ifdef lines omitted in this listing).
871         dumper.dumpAsTxt(dump_file);
873         dumper.dump(dump_file);
// Serializes the executable graph back into an ICNNNetwork representation
// (used for execution-graph introspection).
878 InferenceEngine::ICNNNetwork::Ptr MKLDNNGraph::dump() const {
879     return dump_graph_as_ie_net(*this);
// Returns whether the topology supports dynamic batching: DFS over all layers
// reachable from the first input, rejecting any layer type outside the
// whitelist below (full type list partially omitted in this sampled listing).
882 bool MKLDNNExecNetwork::CanProcessDynBatch(const InferenceEngine::ICNNNetwork &network) const {
883     InputsDataMap inputs;
884     network.getInputsInfo(inputs);
886     CNNLayerSet inputLayers;
887     std::unordered_set<CNNLayer *> allLayers;
892     auto & secondLayers = inputs.begin()->second->getInputData()->getInputTo();
893     if (secondLayers.empty())
896     bool check_result = true;
897     details::UnorderedDFS(allLayers, secondLayers.begin()->second, [&](CNNLayerPtr layer) {
898         auto type = TypeFromName(layer->type);
899         // This is WA for Tile layer
// Tile over a non-batch axis is allowed even though Tile is not whitelisted.
900         auto tileLayer = dynamic_cast<TileLayer *>(layer.get());
901         if (tileLayer && tileLayer->axis)
906             type != Convolution &&
907             type != Deconvolution &&
908             type != Activation &&
912             type != FullyConnected &&
916             type != Concatenation &&
920             type != BatchNormalization &&
922             check_result = false;
// Creates the synchronous request implementation: a graph-less request for the
// multi-stream case (workers own their graphs), a regular one otherwise.
929 InferenceEngine::InferRequestInternal::Ptr
930 MKLDNNExecNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
931                                           InferenceEngine::OutputsDataMap networkOutputs) {
932     if (graphs.size() > 1)  // streams uses special requests that are not connected to graphs
933         return std::make_shared<MKLDNNGraphlessInferRequest>(networkInputs, networkOutputs);
935     return std::make_shared<MKLDNNInferRequest>(networkInputs, networkOutputs);
// Compiles an IE network for CPU execution:
//  - clones the network, applies INT8 normalization when statistics exist,
//  - unrolls/combines TensorIterator (RNN) constructs,
//  - validates dynamic-batch applicability,
//  - computes thread/stream counts and builds one MKLDNNGraph per stream,
//    each initialized inside its own task (executed by the stream executor).
938 MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network,
940                                      const MKLDNNExtensionManager::Ptr& extMgr) : extensionManager(extMgr) {
941     ICNNNetworkStats* pstats = nullptr;
942     StatusCode s = network.getStats(&pstats, nullptr);
943     // we are cloning network if we have statistics and we can transform network.
944     auto clonedNetwork = cloneNet(network);
946     if (s == StatusCode::OK && pstats && !pstats->isEmpty()) {
947         CNNNetworkInt8Normalizer cnnorm;
948         cnnorm.NormalizeNetwork(*clonedNetwork, *pstats);
// TI handling: try to combine RNN sequences; fall back to unrolling TI bodies.
951     bool ti_proc_ok = !NetPass::CombineRNNSeq(*clonedNetwork) ? NetPass::UnrollTI(*clonedNetwork) : true;
// Unroll RNN cells that mkldnn cannot execute natively (clipping or
// non-default activation sets).
952     ti_proc_ok &= NetPass::UnrollRNN_if(*clonedNetwork, [] (RNNCellBase rnn) -> bool {
953         if (rnn.clip != 0.0f)
955         if ((rnn.cellType == RNNCellBase::GRU || rnn.cellType == RNNCellBase::GRU_LBR) &&
956             rnn.activations != std::vector<std::string> {"sigmoid", "tanh"})
958         if (rnn.cellType == RNNCellBase::LSTM &&
959             rnn.activations != std::vector<std::string> {"sigmoid", "tanh", "tanh"})
964         THROW_IE_EXCEPTION << "Plugin doesn't support Tensor Iterator in pure form. "
965                               "None TI optimization pattern has been applied successfully";
968     if (cfg.batchLimit > 1) {
969         // check topology for applicability
970         if (!CanProcessDynBatch(*clonedNetwork)) {
971             THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: such topology cannot be compiled for dynamic batch!";
974     // check whether any (affinity-related) envs are set and if user requested thread binding
975     const bool bPinningRequested = !check_env_variables() && cfg.useThreadBinding;
976     // general #threads logic
977     const int env_threads = parallel_get_env_threads();
978     // for streams need all (logical) cores, while single-stream case just physical cores (better for servers), as usual
979     const int hw_cores = cfg.throughputStreams > 1 ? parallel_get_max_threads() : getNumberOfCPUCores();
// Priority: explicit config > environment > hardware default.
980     const int threads = cfg.threadsNum ? cfg.threadsNum : (env_threads ? env_threads : hw_cores);
981     const int threads_per_stream = std::max(1, threads/cfg.throughputStreams);
983     // graph(s) initialization in taskExecutor threads (streams), in parallel (in case of streams)
984     std::vector<Task::Ptr> tasks;
986     for (int n = 0; n < cfg.throughputStreams; n++) {
987         MKLDNNGraph::Ptr _graph = std::make_shared<MKLDNNGraph>();
988         graphs.push_back(_graph);
// Captures by value (shared_ptrs, n) plus cfg/network by reference — the task
// must run before this constructor's locals go out of scope.
989         auto task = std::make_shared<InferenceEngine::Task>([=, &cfg, &network]() {
990             _graph->CreateArena(threads_per_stream);
992             if (bPinningRequested) {
993                 _graph->CreateObserver(n, threads_per_stream);
996             _graph->setConfig(cfg);
997             _graph->CreateGraph(*clonedNetwork, extensionManager);
998             if (cfg.throughputStreams > 1)  // for streams, each worker thread has it's own graph
999                 MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph = _graph;
1001         tasks.push_back(task);
1004     if (cfg.throughputStreams > 1) {
1005         // special executor with as many threads as requested #streams, each with it's own initialization task
1006         _taskExecutor = std::make_shared<MultiWorkerTaskExecutor>(tasks);
1008         if (cfg.exclusiveAsyncRequests) {
1009             // special case when all InferRequests are muxed into a single queue
1010             ExecutorManager *executorManager = ExecutorManager::getInstance();
1011             _taskExecutor = executorManager->getExecutor(TargetDeviceInfo::name(TargetDevice::eCPU));
// Single-stream: run the lone init task synchronously on the chosen executor.
1013         _taskExecutor->startTask(tasks[0]);
1014         Task::Status sts = tasks[0]->wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
// Re-throw any exception captured during graph construction tasks.
1016     for (auto t : tasks)
1017         t->checkException();
// Forwards property updates to every per-stream graph.
1020 void MKLDNNExecNetwork::setProperty(const std::map<std::string, std::string> &properties) {
1021     for (auto g : graphs)
1022         g->setProperty(properties);
// Wraps a synchronous request in an async request object, publishes the public
// interface pointer, and (single-graph case only) binds the request to graphs[0].
1025 void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) {
1026     auto syncRequestImpl = CreateInferRequestImpl(_networkInputs, _networkOutputs);
1027     syncRequestImpl->setPointerToExecutableNetworkInternal(shared_from_this());
1028     auto asyncRequestImpl = std::make_shared<MKLDNNAsyncInferRequest>(syncRequestImpl, _taskExecutor,
1029                                                                       _taskSynchronizer, _callbackExecutor);
// Custom deleter releases the request through the IE reference-counting API.
1030     asyncRequest.reset(new InferRequestBase<MKLDNNAsyncInferRequest>(asyncRequestImpl),
1031                        [](IInferRequest *p) { p->Release(); });
1033     asyncRequestImpl->SetPointerToPublicInterface(asyncRequest);
1035     if (graphs.size() == 1) {  // single-stream (legacy/hetero) case - single graph for all requests
1036         auto mkldnnSyncRequest = dynamic_cast<MKLDNNInferRequest *>(syncRequestImpl.get());
1037         if (!mkldnnSyncRequest)
1038             THROW_IE_EXCEPTION << " Cannot get mkldnn sync request.";
1039         mkldnnSyncRequest->SetGraph(graphs[0]);
1043 void MKLDNNExecNetwork::GetExecGraphInfo(InferenceEngine::ICNNNetwork::Ptr &graphPtr) {
1044 graphPtr = graphs[0]->dump();