// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <blob_factory.hpp>
#include "nodes/mkldnn_reshape_node.h"
#include "mkldnn_graph_optimizer.h"
#include <nodes/mkldnn_activation_node.h>
#include "nodes/mkldnn_pooling_node.h"
#include "nodes/mkldnn_eltwise_node.h"
#include "nodes/mkldnn_depthwise_node.h"
#include "nodes/mkldnn_concat_node.h"
#include "nodes/mkldnn_reorder_node.h"

#include <string>
#include <list>
#include <memory>
#include <set>
#include <ie_layers_internal.hpp>
#include <nodes/mkldnn_bin_conv_node.h>
#include <nodes/mkldnn_quantize_node.h>
#include "cpu_isa_traits.hpp"

using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;

MKLDNNGraphOptimizer::MKLDNNGraphOptimizer() {}

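// Common (target-independent) optimizations: each pass fuses or merges adjacent
// layers into a single node and marks the absorbed nodes as dropped; the dropped
// nodes and edges are then purged from the graph.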
void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
    MergeGroupConvolution(graph);
    graph.RemoveDroppedNodes();

    FuseConvolutionAndDepthwise(graph);
    graph.RemoveDroppedNodes();

    FuseConvolutionAndActivation(graph);
    graph.RemoveDroppedNodes();

    FuseConvolutionAndDepthwise(graph);
    graph.RemoveDroppedNodes();

    FuseConvolutionAndDWConvolution(graph);
    graph.RemoveDroppedNodes();

    FuseBinaryConvolutionAndQuantize(graph);
    graph.RemoveDroppedNodes();

    FuseBatchNormWithScale(graph);
    graph.RemoveDroppedNodes();

    FuseFullyConnectedAndActivation(graph);
    graph.RemoveDroppedNodes();

    RemoveIdentityOperator(graph);
    graph.RemoveDroppedNodes();

    FuseConvolutionSumAndConvolutionSumActivation(graph);
    graph.RemoveDroppedNodes();

    graph.RemoveDroppedEdges();
}

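// Implementation-specific optimizations: clean-ups that depend on the primitives
// actually selected, such as folding I/O ScaleShift layers into Reorders and
// collapsing pairs of back-to-back Reorder nodes.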
void MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations(MKLDNNGraph &graph) {
    RemoveIOScaleShifts(graph);
    graph.RemoveDroppedNodes();

    DropDoubleReorders(graph);
    graph.RemoveDroppedNodes();

    graph.RemoveDroppedEdges();
}

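// Detects a Split whose branches are identical Convolutions feeding a common
// Concat and merges them back into a single grouped Convolution, dropping the
// surrounding Split and Concat nodes.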
void MKLDNNGraphOptimizer::MergeGroupConvolution(MKLDNNGraph &graph) {
    for (auto node : graph.GetNodes()) {
        // Split with at least 2 Convolutions
        if (!IsOneOf(node->getType(), {Split}) || node->getChildEdges().size() < 2 ||
                !IsOneOf(node->getChildEdgeAt(0)->getChild()->getType(), {Convolution, Convolution_Activation})) {
            continue;
        }
        bool canBeMerged = true;

        auto& split = node;

        auto convInEdge = split->getChildEdgeAt(0);
        auto conv = convInEdge->getChild();
        auto convOutEdge = conv->getChildEdgeAt(0);

        auto convType = conv->getType();
        auto convInDims = convInEdge->getDims();
        auto convOutDims = convOutEdge->getDims();

        // Convolutions of the same type with Concat as a child
        for (size_t i = 1; i < split->getChildEdges().size(); i++) {
            auto childEdge = split->getChildEdgeAt(i);
            auto child = childEdge->getChild();
            Type type = child->getType();

            if (convType != type || child->getChildEdgeAt(0)->getChild()->getType() != Concatenation ||
                    convOutDims != child->getChildEdgeAt(0)->getDims() || child->getChildEdges().size() != 1 ||
                    convInDims != childEdge->getDims()) {
                canBeMerged = false;
                break;
            }
        }

        if (!canBeMerged) continue;

        // TODO: Rewrite the topology optimizer completely; it should be clean and understandable
        auto concat = conv->getChildEdgeAt(0)->getChild();
        // Merge and remove Convolution
        for (size_t i = 1; i < split->getChildEdges().size(); i++) {
            auto peerInEdge = split->getChildEdgeAt(i);
            auto peer = peerInEdge->getChild();
            conv->mergeWith(peer);
            convInDims[1] += (peerInEdge->getDims())[1];
            convOutDims[1] += (peer->getChildEdgeAt(0)->getDims())[1];
            peer->remove();
        }
        conv->inDims[0] = convInDims;
        conv->outDims[0] = convOutDims;

        conv->fuseWith(split);
        conv->fuseWith(concat);

        graph.DropNode(split);
        graph.DropNode(concat);
    }
}

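// Fuses a BatchNormalization node with a directly following ScaleShift
// (Depthwise) node, provided the BatchNormalization is not a network output
// and has exactly one consumer.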
void MKLDNNGraphOptimizer::FuseBatchNormWithScale(MKLDNNGraph &graph) {
    auto &graphNodes = graph.GetNodes();

    for (int i = 0; i < graphNodes.size(); i++) {
        const auto& bn = graphNodes[i];
        if (bn->getType() == BatchNormalization) {
            const auto& outputNodes = graph.GetOutputNodes();
            const std::string node_name = bn->getName();
            // Check that the node is not an output node
            if (std::find_if(outputNodes.begin(), outputNodes.end(),
                            [&node_name](const MKLDNNNodePtr& x) {
                                return x->getName() == node_name;}) == outputNodes.end()) {
                if (bn->getChildEdges().size() == 1) {
                    auto child = bn->getChildEdgeAt(0)->getChild();
                    if (child->type == Depthwise && child->getCnnLayer()->type == "ScaleShift") {
                        bn->fuseWith(child);
                        graph.DropNode(child);
                    }
                }
            }
        }
    }
}

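// Fuses a (Binary)Convolution with a following activation: ReLU is always
// allowed; ELU, Logistic, Bounded ReLU and Clamp only for FP32 convolutions.
// Up to two consecutive activations are fused, and the Convolution -> MaxPool
// -> Activation pattern is handled as well.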
void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) {
    auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
        for (auto a : algs) {
            if (alg == a) {
                return true;
            }
        }
        return false;
    };

    auto& graphNodes = graph.GetNodes();

    auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) {
        if (!activation->getCnnLayer())
            return false;

        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get());

        return activationNode &&
            (activationNode->getAlgorithm() == eltwise_relu ||
            (conv->getCnnLayer()->precision == Precision::FP32 &&
             isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp})));
    };

    for (int i = 0; i < graphNodes.size(); i++) {
        if (graphNodes[i]->getType() == Convolution || graphNodes[i]->getType() == BinaryConvolution) {
            auto conv = graphNodes[i];

            auto fuse = [&] (MKLDNNNodePtr relu) {
                if (graphNodes[i]->getType() != BinaryConvolution)
                    conv->setType(Convolution_Activation);
                conv->fuseWith(relu);
            };

            if (conv->getChildEdges().size() == 1) {
                auto ch1 = conv->getChildEdgeAt(0)->getChild();

                if (isFusingSupported(conv, ch1)) {
                    fuse(ch1);

                    if (ch1->getChildEdges().size() == 1) {
                        auto ch2 = ch1->getChildEdgeAt(0)->getChild();

                        if (isFusingSupported(conv, ch2)) {
                            fuse(ch2);
                            graph.DropNode(ch2);
                        }
                    }
                    graph.DropNode(ch1);
                } else {
                    if (ch1->type == Pooling) {
                        auto pool = ch1;
                        bool is_max_pool =
                                dynamic_cast<PoolingLayer *>(pool->getCnnLayer().get())->_type ==
                                PoolingLayer::PoolType::MAX;

                        if (is_max_pool && pool->getChildEdges().size() == 1) {
                            auto ch2 = pool->getChildEdgeAt(0)->getChild();
                            if (isFusingSupported(conv, ch2)) {
                                fuse(ch2);
                                graph.DropNode(ch2);
                            }
                        }
                    }
                }
            }
        }
    }
}

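// Fuses a (Binary)Convolution with up to two following Depthwise nodes
// (ScaleShift with biases, or PReLU), turning them into post-operations.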
void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) {
    auto& graphNodes = graph.GetNodes();

    auto isSutableParentNode = [](MKLDNNNodePtr node) {
        bool isSutableConv = (node->getType() == Convolution || node->getType() == Convolution_Activation) &&
                             node->getCnnLayer()->precision == Precision::FP32;
        bool isSutableBinConv = node->getType() == BinaryConvolution;
        return (isSutableConv || isSutableBinConv) && node->getChildEdges().size() == 1;
    };

    auto isSutableChildNode = [](MKLDNNNodePtr node) {
        if (node->getType() != Depthwise)
            return false;

        if (!node->getCnnLayer())
            return false;

        auto* depthwiseNode = dynamic_cast<MKLDNNDepthwiseNode *>(node.get());
        return ((depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_scale_shift && depthwiseNode->isWithBiases()) ||
                (depthwiseNode->getAlgorithm() == mkldnn::algorithm::depthwise_prelu));
    };

    for (int i = 0; i < graphNodes.size(); i++) {
        auto conv = graphNodes[i];
        if (!isSutableParentNode(conv)) continue;

        auto depthwise0 = conv->getChildEdgeAt(0)->getChild();
        if (!isSutableChildNode(depthwise0)) continue;

        conv->fuseWith(depthwise0);
        if (conv->type != BinaryConvolution)
            conv->setType(Convolution_Depthwise);

        if (depthwise0->getChildEdges().size() == 1) {
            auto depthwise1 = depthwise0->getChildEdgeAt(0)->getChild();

            if (isSutableChildNode(depthwise1)) {
                conv->fuseWith(depthwise1);
                graph.DropNode(depthwise1);
            }
        }

        graph.DropNode(depthwise0);
    }
}

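// Fuses a non-grouped Convolution (or BinaryConvolution) with a following 3x3
// depthwise Convolution (out_depth == group, pads of 1, unit dilation, with
// biases). The fusion is applied only where it is expected to pay off: for int8
// parents only when AVX512 is unavailable, for FP32 when the depthwise input
// and output together exceed half of the L3 cache.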
void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
    auto& graphNodes = graph.GetNodes();

    auto isConvolutionNode = [](MKLDNNNodePtr node) {
        return node->getType() == Convolution || node->getType() == Convolution_Activation;
    };

    auto isBinaryConvolutionNode = [](MKLDNNNodePtr node) {
        return node->getType() == BinaryConvolution;
    };

    auto is1x1Convolution = [](ConvolutionLayer* layer) {
        return layer->_kernel[X_AXIS] == 1 && layer->_kernel[Y_AXIS] == 1;
    };

    auto isSutableParentConvolution = [&](MKLDNNNodePtr node) {
        if (isBinaryConvolutionNode(node)) {
            auto *layer = dynamic_cast<BinaryConvolutionLayer *>(node->getCnnLayer().get());

            bool isSupportedParams = layer->_group == 1;
            if (!isSupportedParams) return false;
        } else {
            auto *layer = dynamic_cast<ConvolutionLayer *>(node->getCnnLayer().get());

            bool isSupportedParams = layer->_group == 1 &&
                                     ((is1x1Convolution(layer) && layer->_stride[X_AXIS] == 1 &&
                                       layer->_stride[Y_AXIS] == 1) || !is1x1Convolution(layer)) &&
                                     (layer->precision == Precision::FP32 || layer->precision == Precision::I8);
            if (!isSupportedParams) return false;
        }

        return node->getChildEdges().size() == 1 && isConvolutionNode(node->getChildEdgeAt(0)->getChild());
    };

    auto isSutableChildConvolution = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
        auto* childLayer = dynamic_cast<ConvolutionLayer*>(childNode->getCnnLayer().get());

        if (!isBinaryConvolutionNode(parentNode)) {
            auto* parentLayer = dynamic_cast<ConvolutionLayer*>(parentNode->getCnnLayer().get());
            if (parentLayer->precision != childLayer->precision)
                return false;
        }

        auto allPads = getPaddings(*childLayer);
        bool isSupportedParams = childLayer->_out_depth == childLayer->_group &&
                                 childLayer->_out_depth != 1 &&
                                 // Depthwise convolution output should be a multiple of 8
                                 childLayer->_kernel[X_AXIS] == 3 && childLayer->_kernel[Y_AXIS] == 3 &&
                                 allPads.begin[X_AXIS] == 1 && allPads.begin[Y_AXIS] == 1 &&
                                 childLayer->_dilation[X_AXIS] == 1 && childLayer->_dilation[Y_AXIS] == 1 &&
                                 childLayer->_biases != nullptr && childLayer->_biases->size() != 0;

        return isSupportedParams;
    };

    auto isFusingWorthwhile = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
        if (isBinaryConvolutionNode(parentNode)) {
            return true;
        }

        auto* layer = dynamic_cast<ConvolutionLayer*>(childNode->getCnnLayer().get());

        auto inDims = childNode->inDims[0];
        auto outDims = childNode->outDims[0];
        int elemSize = MKLDNNExtensionUtils::sizeOfDataType(MKLDNNExtensionUtils::IEPrecisionToDataType(layer->precision));

        int L3_cache_size = mkldnn_get_cache_size(3, false);
        int dw_conv_input_size = inDims[0] * inDims[1] * inDims[2] * inDims[3] * elemSize;
        int dw_conv_output_size = outDims[0] * outDims[1] * outDims[2] * outDims[3] * elemSize;

        bool isInt8 = layer->precision == Precision::I8 || layer->precision == Precision::U8;
        bool isAVX512NotSupported = !mkldnn::impl::cpu::mayiuse(impl::cpu::cpu_isa_t::avx512_common);

        return isInt8 ? isAVX512NotSupported : (dw_conv_input_size + dw_conv_output_size > L3_cache_size / 2);
    };

    for (int i = 0; i < graphNodes.size(); i++) {
        if (!isConvolutionNode(graphNodes[i]) && !isBinaryConvolutionNode(graphNodes[i])) continue;

        auto parentConvNode = graphNodes[i];
        if (!isSutableParentConvolution(parentConvNode)) continue;

        auto childConvNode = parentConvNode->getChildEdgeAt(0)->getChild();
        if (!isSutableChildConvolution(parentConvNode, childConvNode)) continue;

        if (!isFusingWorthwhile(parentConvNode, childConvNode)) continue;

        parentConvNode->fuseWith(childConvNode);
        graph.DropNode(childConvNode);
    }
}

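// Fuses a BinaryConvolution with a following two-level Quantize node and copies
// the "input low" values of the Quantize into the convolution node as
// per-channel binarization thresholds.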
void MKLDNNGraphOptimizer::FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph) {
    auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) {
        auto& edges = graph.GetEdges();
        for (auto it = edges.begin(); it != edges.end(); it++) {
            if ((*it) == edge) {
                edges.erase(it);
                return;
            }
        }
    };

    auto& graphNodes = graph.GetNodes();

    auto isSutableParentNode = [](MKLDNNNodePtr node) {
        bool isSutableBinConv = node->getType() == BinaryConvolution;
        return isSutableBinConv && node->getChildEdges().size() == 1;
    };

    auto isSutableChildNode = [](MKLDNNNodePtr node) {
        if (!node->getCnnLayer())
            return false;

        auto* quantizeLayer = dynamic_cast<QuantizeLayer*>(node->getCnnLayer().get());
        bool isSutableQuantize = node->getType() == Quantize && quantizeLayer->levels == 2;

        return isSutableQuantize;
    };

    for (int i = 0; i < graphNodes.size(); i++) {
        auto parent = graphNodes[i];
        if (!isSutableParentNode(parent)) continue;

        auto child = parent->getChildEdgeAt(0)->getChild();
        if (!isSutableChildNode(child)) continue;

        parent->fuseWith(child);

        auto* binConvNode = dynamic_cast<MKLDNNBinaryConvolutionNode*>(parent.get());

        auto parents = child->parentEdges;
        for (size_t i = 0; i < parents.size(); i++) {
            auto p_edge = parents[i].lock();
            if (p_edge->getParent()->getType() == Input) {
                InferenceEngine::SizeVector dims;
                dims.push_back(binConvNode->getChildEdgeAt(0)->getDims()[1]);

                auto InputLowBlob = dynamic_cast<TBlob<float>*>(p_edge->getParent()->getCnnLayer()->blobs["custom"].get());

                auto inputLowData = InputLowBlob->buffer().as<float*>();
                int inputLowAxis = p_edge->getDims().ndims() == 1 ? 0 : 1;
                bool isInputLowBroadcasted = p_edge->getDims()[inputLowAxis] != dims[0];

                for (int i = 0; i < dims[0]; i++) {
                    binConvNode->pushBinarizationThreshold(inputLowData[isInputLowBroadcasted ? 0 : i]);
                }

                break;
            }
        }

        for (size_t i = 0; i < parents.size(); i++) {
            auto p_edge = parents[i].lock();
            if (p_edge->getParent()->getType() == BinaryConvolution)
                continue;

            removeEdge(graph, p_edge);
        }

        graph.DropNode(child);
    }
}

/**
 *  Check if there is a data dependency between parent and child
 *  BFS starting from parent and comparing with child
 *
 * @param parent head of BFS
 * @param child node we try to find
 * @return true if child is reachable from parent, i.e. child (transitively) consumes data produced by parent
 */
static bool is_data_dependency(const std::shared_ptr<MKLDNNNode> &parent,
                               const std::shared_ptr<MKLDNNNode> &child) {
    std::set<MKLDNNNode*> visited;
    std::list<MKLDNNNode*> nextLayers {parent.get()};

    for (; !nextLayers.empty();) {
        auto layer = *nextLayers.begin();
        if (layer == child.get()) return true;
        for (auto oe : layer->getChildEdges()) {
            auto nn = oe.lock()->getChild();
            if (visited.find(nn.get()) == visited.end()) {
                nextLayers.push_back(nn.get());
                visited.insert(nn.get());
            }
        }
        nextLayers.pop_front();
    }
    return false;
}

/*
 *  Before:
 *
 *        ***             ***                   ***             ***
 *         |               |                     |               |
 *    +========+       +========+           +========+       +========+
 *    |  any   |       | conv 2 |           |  any   |       | conv 2 |
 *    +========+       +========+           +========+       +========+
 *         |               |                     |               |
 *      +=====================+               +=====================+
 *      |         Sum         |      or       |         Sum         |
 *      +=====================+               +=====================+
 *                 |                                     |
 *         +===============+                            ***
 *         |     Relu      |
 *         +===============+
 *                 |
 *                ***
 *
 *  After:
 *
 *        ***             ***
 *         |               |
 *    +========+       +========+
 *    |  any   |-------|        |
 *    +========+       | conv2  |
 *                     |   +    |
 *                     |  sum   |
 *                     |   +    |
 *                     | [relu] |
 *                     |        |
 *                     +========+
 *                         |
 *                 +-------+
 *                 |
 *                ***
 */

void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph) {
    std::vector<MKLDNNNodePtr> &graphNodes = graph.GetNodes();

    auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
        for (auto a : algs) {
            if (alg == a) {
                return true;
            }
        }
        return false;
    };

    auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) {
        if (!activation->getCnnLayer())
            return false;

        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get());

        return activationNode &&
            (activationNode->getAlgorithm() == eltwise_relu ||
            (conv->getCnnLayer()->precision == Precision::FP32 &&
             isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp})));
    };

    for (auto &graphNode : graphNodes) {
        if (graphNode->getType() != Eltwise)
            continue;

        if (!std::dynamic_pointer_cast<MKLDNNEltwiseNode>(graphNode)->isSum()) continue;
        if (!std::dynamic_pointer_cast<MKLDNNEltwiseNode>(graphNode)->isUnitScales()) continue;

        auto parent1 = graphNode->getParentEdgeAt(0)->getParent();
        auto parent2 = graphNode->getParentEdgeAt(1)->getParent();
        // TODO: Extend to several inputs
        if (graphNode->getParentEdges().size() != 2 ||
            (parent1->getType() != Convolution && parent1->getType() != BinaryConvolution &&
             parent2->getType() != Convolution && parent2->getType() != BinaryConvolution))
            continue;

        auto mergedConv = (parent1->getType() == Convolution || parent1->getType() == BinaryConvolution) ? parent1 : parent2;
        auto peerNode = (parent1->getType() == Convolution || parent1->getType() == BinaryConvolution) ? parent2 : parent1;
        if ((peerNode->getType() == Convolution || peerNode->getType() == BinaryConvolution) &&
            mergedConv->getChildEdges().size() != 1) {
            mergedConv = parent2;
            peerNode = parent1;
        }
        auto sum = graphNode;
        auto lastNode = sum;

        bool fuse_allowed = mergedConv->getChildEdges().size() == 1;
        for (size_t j = 0; fuse_allowed && j < mergedConv->getParentEdges().size(); j++)
            if (mergedConv->getParentEdgeAt(j)->getParent() == peerNode)
                fuse_allowed = false;

        // The fused Conv+Sum primitive works in place, which means its input blob will
        // be overwritten. Verify that all other consumers have already read that blob
        // before we are allowed to spoil the input data.
        // TODO: rewrite once we add an "Inplace" reporting mechanism
        for (auto & edge : peerNode->getChildEdges()) {
            if (!fuse_allowed)
                break;
            fuse_allowed &= is_data_dependency(edge.lock()->getChild(), sum);
        }
        if (!fuse_allowed) continue;

        if (graphNode->getChildEdges().size() == 1 &&
                isFusingSupported(graphNode, graphNode->getChildEdgeAt(0)->getChild())) {
            auto relu_shared = graphNode->getChildEdgeAt(0)->getChild();
            lastNode = relu_shared;
            if (mergedConv->getType() != BinaryConvolution)
                mergedConv->setType(Convolution_Sum_Activation);
            mergedConv->fuseWith(sum);
        } else {
            if (mergedConv->getType() != BinaryConvolution)
                mergedConv->setType(Convolution_Sum);
        }

        mergedConv->fuseWith(lastNode);

        if (mergedConv->fusedWith.size() > 0 &&
           (mergedConv->fusedWith[0]->getType() == Convolution || mergedConv->fusedWith[0]->getType() == BinaryConvolution)) {
            // Merged with DW_conv. Shape may change
            mergedConv->inDims.push_back(mergedConv->fusedWith[0]->outDims[0]);
        } else {
            mergedConv->inDims.push_back(mergedConv->outDims[0]);
        }

        size_t childIdx = 0;
        for (childIdx = 0; childIdx < peerNode->getChildEdges().size(); childIdx++) {
            if (peerNode->getChildEdgeAt(childIdx)->getChild() == sum) {
                break;
            }
        }

        int peer_port = peerNode->getChildEdgeAt(childIdx)->getInputNum();
        peerNode->getChildEdgeAt(childIdx)->drop();

        MKLDNNEdgePtr edgePtr(new MKLDNNEdge(peerNode, mergedConv, peer_port, 1));
        graph.GetEdges().push_back(edgePtr);

        mergedConv->addEdge(edgePtr);

        std::vector<MKLDNNEdgeWeakPtr> edges_to_reconnect = lastNode->getChildEdges();
        for (auto &edge_w : edges_to_reconnect) {
            auto edge = edge_w.lock();
            auto child = edge->getChild();
            int idxParent = edge->getInputNum();
            int idxChild = edge->getOutputNum();

            // Reconnect after the activation/sum; the port index must be 0
            IE_ASSERT(idxParent == 0);

            edge->drop();

            MKLDNNEdgePtr newEdge(new MKLDNNEdge(mergedConv, child, idxParent, idxChild));
            graph.GetEdges().push_back(newEdge);
            child->addEdge(newEdge);
        }

        if (lastNode != sum) {
            lastNode->remove();
        }
        sum->remove();
    }
}

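// Fuses a FullyConnected node with a following ReLU activation; applied only
// to non-FP32 precisions (see the TODO below).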
void MKLDNNGraphOptimizer::FuseFullyConnectedAndActivation(MKLDNNGraph &graph) {
    auto& graphNodes = graph.GetNodes();

    auto isFusingSupported = [&](MKLDNNNodePtr fc, MKLDNNNodePtr activation) {
        if (!activation->getCnnLayer())
            return false;

        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get());

        // TODO: fusing on FP32 is not yet optimized in mkl-dnn
        return activationNode && fc->getCnnLayer()->precision != Precision::FP32 &&
            (activationNode->getAlgorithm() == eltwise_relu);
    };

    for (int i = 0; i < graphNodes.size(); i++) {
        if (graphNodes[i]->getType() == FullyConnected) {
            auto fc = graphNodes[i];

            auto fuse = [&] (MKLDNNNodePtr relu) {
                fc->setType(FullyConnected_Activation);
                fc->fuseWith(relu);
            };

            if (fc->getChildEdges().size() == 1) {
                auto ch1 = fc->getChildEdgeAt(0)->getChild();

                if (isFusingSupported(fc, ch1)) {
                    fuse(ch1);
                    graph.DropNode(ch1);
                }
            }
        }
    }
}

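// Drops identity operations: Power with power == 1, scale == 1 and offset == 0,
// ScaleShift without weights and biases, and Copy nodes.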
void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) {
    for (MKLDNNNodePtr& node : graph.GetNodes()) {
        bool toDrop = false;

        if (node->getType() == Power) {
            PowerLayer* l = dynamic_cast<PowerLayer*>(node->getCnnLayer().get());

            if (l->power == 1.0f && l->scale == 1.0f && l->offset == 0.0f) toDrop = true;
        }

        if (node->getType() == Depthwise && node->getCnnLayer()->type == "ScaleShift") {
            ScaleShiftLayer* l = dynamic_cast<ScaleShiftLayer*>(node->getCnnLayer().get());

            if (l->_weights == nullptr && l->_biases == nullptr) toDrop = true;
        }

        if (node->getType() == Copy) toDrop = true;

        if (toDrop) graph.DropNode(node);
    }
}

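// Collapses two consecutive Reorder nodes into a single Reorder that converts
// directly from the first node's input format to the second node's output
// format, keeping the scales of whichever of the two reorders carried them.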
void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) {
    std::set<MKLDNNNodePtr> processed;
    std::vector<MKLDNNNodePtr> newNodes;
    for (MKLDNNNodePtr& node : graph.GetNodes()) {
        if (processed.find(node) == processed.end() && node->getType() == Reorder
            && node->getChildEdges().size() == 1
            && node->getChildEdgeAt(0)->getChild()->getType() == Reorder) {
            auto nextNode = node->getChildEdgeAt(0)->getChild();
            MKLDNNReorderNode* n = dynamic_cast<MKLDNNReorderNode*>(node.get());
            MKLDNNReorderNode* nn = dynamic_cast<MKLDNNReorderNode*>(nextNode.get());

            auto scales = n->_scales;

            if (n->_scales != nullptr && nn->_scales != nullptr) {
                THROW_IE_EXCEPTION << "Merging scales of two subsequent reorders is unsupported yet";
            } else {
                if (scales == nullptr) {
                    scales = nn->_scales;
                }
            }

            MKLDNNNodePtr p = n->getParentEdgeAt(0)->getParent();
            MKLDNNNodePtr c = nn->getChildEdgeAt(0)->getChild();

            auto oldEdgeNum = n->getParentEdgeAt(0)->getInputNum();

            graph.DropNode(node);
            graph.DropNode(nextNode);

            processed.insert(node);
            processed.insert(nextNode);

            MKLDNNEdgePtr edge;
            for (auto cur : p->getChildEdgesAtPort(oldEdgeNum)) {
                if (cur->getChild() == c)
                    edge = cur;
            }
            if (!edge) THROW_IE_EXCEPTION << "Inappropriate graph processing";

            std::string layerName = edge->getParent()->getName() + "_ScaleReorder_" + edge->getChild()->getName();
            CNNLayerPtr layer(new CNNLayer({layerName,
                                            "Reorder",
                                            n->getInput().getPrecision()}));
            MKLDNNNodePtr newReorder(new MKLDNNReorderNode(layer, graph.getEngine()));
            auto *reorderPtr = dynamic_cast<MKLDNNReorderNode *>(newReorder.get());
            if (reorderPtr) {
                reorderPtr->setDescs(n->getInput(), nn->getOutput());
                reorderPtr->_scales = scales;
            }

            // Rewire the graph: replace the dropped Reorder pair with the single new Reorder
            auto oIndex = edge->getOutputNum();
            auto iIndex = edge->getInputNum();
            if (iIndex < 0 || oIndex < 0)
                THROW_IE_EXCEPTION << "Cannot create reorder for nodes: "
                                   << edge->getParent()->getName() << " and "
                                   << edge->getChild()->getName() << ".";
            edge->drop();

            MKLDNNEdgePtr beforeNode(new MKLDNNEdge(edge->getParent(), newReorder, iIndex, 0));
            MKLDNNEdgePtr afterNode(new MKLDNNEdge(newReorder, edge->getChild(), 0, oIndex));

            // Add edge for beforeNode
            beforeNode->getChild()->parentEdges.push_back(beforeNode);
            edge->getParent()->childEdges.push_back(beforeNode);

            // Add edge for afterNode
            afterNode->getParent()->childEdges.push_back(afterNode);
            edge->getChild()->parentEdges.push_back(afterNode);

            newReorder->getSupportedDescriptors();
            newReorder->initSupportedPrimitiveDescriptors();
            newReorder->selectOptimalPrimitiveDescriptor();

            graph.GetEdges().push_back(beforeNode);
            graph.GetEdges().push_back(afterNode);

            // Just to check that the edge descriptors agree
            afterNode->getDesc();
            beforeNode->getDesc();

            newNodes.push_back(newReorder);
            graph.GetEdges().erase(std::remove(graph.GetEdges().begin(), graph.GetEdges().end(), edge), graph.GetEdges().end());
        }
    }
    for (MKLDNNNodePtr& node : newNodes) {
        graph.GetNodes().push_back(node);
    }
}

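// Folds ScaleShift nodes inserted around network inputs/outputs
// ("_iScaleShift_" / "_oScaleShift_") into the adjacent Reorder by moving their
// weights into the reorder scales, when the input and output precisions differ.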
void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
    for (MKLDNNNodePtr& node : graph.GetNodes()) {
        if (node->getType() == Depthwise && node->getCnnLayer()->type == "ScaleShift") {
            ScaleShiftLayer* l = dynamic_cast<ScaleShiftLayer*>(node->getCnnLayer().get());

            auto cur = l->insData[0].lock();
            if (cur == nullptr) {
                THROW_IE_EXCEPTION << "[MKLDNN] error - invalid input data";
            }
            if (cur->precision != l->outData[0]->precision) {
                if (node->name.find("_iScaleShift_") != std::string::npos) {
                    auto child = node->childEdges[0].lock()->getChild();
                    if (child->type == Reorder) {
                        MKLDNNReorderNode* rn = dynamic_cast<MKLDNNReorderNode*>(child.get());
                        if (rn != nullptr) {
                            rn->_scales = l->_weights;
                            graph.DropNode(node);
                        }
                    } else {
                        THROW_IE_EXCEPTION << "Strange case. No Reorder after iScaleShift";
                    }
                } else if (node->name.find("_oScaleShift_") != std::string::npos) {
                    auto parent = node->parentEdges[0].lock()->getParent();

                    if (parent->type == Reorder) {
                        MKLDNNReorderNode* rn = dynamic_cast<MKLDNNReorderNode*>(parent.get());
                        if (rn != nullptr) {
                            rn->_scales = l->_weights;
                            graph.DropNode(node);
                        }
                    } else {
                        THROW_IE_EXCEPTION << "Strange case. No Reorder before oScaleShift";
                    }
                }
            }
        }
    }
}

bool MKLDNNGraphOptimizer::IsOneOf(Type type, std::vector<Type> types) {
    for (auto tp : types) {
        if (type == tp) {
            return true;
        }
    }
    return false;
}