36cefe3fe91171412b2785f1babcd3974ef4cb2f
[platform/upstream/dldt.git] / inference-engine / src / cldnn_engine / cldnn_graph.cpp
1 // Copyright (C) 2018-2020 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 //
4
5 #include <list>
6 #include <set>
7 #include <unordered_set>
8 #include <sstream>
9 #include <api/cldnn.hpp>
10 #include <api/network.hpp>
11 #include <api/profiling.hpp>
12 #include <api/custom_gpu_primitive.hpp>
13 #include <chrono>
14 #include <cmath>
15 #include <algorithm>
16 #include "cldnn_graph.h"
17 #include "simple_math.h"
18 #include <description_buffer.hpp>
19 #include <cldnn/cldnn_config.hpp>
20 #include <graph_tools.hpp>
21 #include <ie_layers_internal.hpp>
22 #include <net_pass.h>
23 #include "cldnn_infer_request.h"
24 #include <threading/ie_executor_manager.hpp>
25 #include "details/caseless.hpp"
26 #include <fstream>
27 #include <utility>
28 #include <sys/types.h>
29 #include <sys/stat.h>
30 #include <exec_graph_info.hpp>
31
32 using namespace InferenceEngine;
33 using namespace InferenceEngine::details;
34
35 namespace CLDNNPlugin {
36
37 CLDNNGraph::CLDNNGraph(InferenceEngine::ICNNNetwork& network, gpu::ClContext::Ptr context, Config config, uint16_t stream_id)
38     : m_context(context)
39     , m_networkName(network.getName())
40     , m_config(config)
41     , m_stream_id(stream_id) {
42     m_program = std::make_shared<Program>(network, GetEngine(), m_config);
43     Build();
44 }
45
46 CLDNNGraph::CLDNNGraph(std::shared_ptr<CLDNNGraph> graph, uint16_t stream_id)
47         : m_context(graph->m_context)
48         , m_program(graph->m_program)
49         , m_networkName(graph->m_networkName)
50         , m_config(graph->m_config)
51         , m_stream_id(stream_id) {
52     Build();
53 }
54
55 void CLDNNGraph::UpdateLayersMaps() {
56     primitiveIDs = m_program->primitiveIDs;
57     primitivesToIRLayersMap = m_program->primitivesToIRLayersMap;
58     IRToNgraphLayersMap = m_program->IRToNgraphLayersMap;
59     prevPrimitiveIDs = m_program->prevPrimitiveIDs;
60     profilingIDs = m_program->profilingIDs;
61     perfMap = m_program->perfMap;
62     outputDims = m_program->outputDims;
63 }
64
65 void CLDNNGraph::Build() {
66     UpdateLayersMaps();
67
68     if (GetMaxDynamicBatchSize() > 1) {
69         int m_bv_sz = m_program->GetMaxBatchSizeForSingleProgram();
70         for (int b = m_bv_sz - 1; b >= 0; b--) {
71             auto network = BuildNetwork(m_program->getCompiledProgram(b));
72             m_networks.insert(m_networks.begin(), network);
73             GetEngine()->release_pending_memory(network->get_id());
74         }
75     } else {
76         auto network = BuildNetwork(m_program->getCompiledProgram());
77         m_networks.emplace_back(network);
78         GetEngine()->release_pending_memory(network->get_id());
79     }
80
81     UpdateImplementationsMap();
82 }
83
84 std::shared_ptr<cldnn::network> CLDNNGraph::BuildNetwork(std::shared_ptr<cldnn::program> program) {
85     auto network = std::make_shared<cldnn::network>(*program, m_stream_id);
86
87     if (!m_config.graph_dumps_dir.empty() && m_stream_id == 0) {
88         static int net_id = 0;
89         auto steps_info = network->get_optimization_steps_info();
90         size_t step_idx = 0;
91         for (auto& step : steps_info) {
92             CNNNetwork net(GetExecGraphInfoByPrimitivesInfo(step.second, true));
93             net.serialize(m_config.graph_dumps_dir + std::to_string(net_id) + "_" +
94                           std::to_string(step_idx) + "_" + step.first + "_graph.xml");
95             step_idx++;
96         }
97         net_id++;
98     }
99
100     return network;
101 }
102
103 InferenceEngine::ICNNNetwork::Ptr CLDNNGraph::GetExecGraphInfoByPrimitivesInfo(std::vector<cldnn::primitive_info>& primitives_info,
104                                                                                bool filter_const_primitives) {
105     auto net = std::make_shared<details::CNNNetworkImpl>();
106     net->setPrecision(Precision::FP32);
107     net->setName("runtime_gpu_graph");
108     if (m_config.useProfiling) {
109         try {
110             // Update may throw an exception for step-by-step runtime graph dump,
111             // since network->get_executed_primitives() method can't be called before network execution
112             UpdatePerfStatistics();
113         } catch (std::exception&) {
114         }
115     }
116
117     std::vector<std::pair<cldnn::primitive_info, CNNLayerPtr>> node2layer;
118
119     auto data_type_to_precision = [](cldnn::data_types dt) {
120         switch (dt) {
121             case cldnn::data_types::bin: return Precision::BIN;
122             case cldnn::data_types::f32: return Precision::FP32;
123             case cldnn::data_types::f16: return Precision::FP16;
124             case cldnn::data_types::i32: return Precision::I32;
125             case cldnn::data_types::i64: return Precision::I64;
126             case cldnn::data_types::u8: return Precision::U8;
127             case cldnn::data_types::i8: return Precision::I8;
128             default: return Precision::UNSPECIFIED;
129         }
130     };
131
132     auto to_IE_type_name = [](const std::string& cldnn_name) -> std::string{
133         static std::map<std::string, std::string> type_n2l {
134                 { "activation", "Activation" },
135                 { "arg_max_min", "ArgMax" },
136                 { "average_unpooling", "AverageUnpooling" },
137                 { "batch_norm", "BatchNormalization" },
138                 { "binary_convolution", "BinaryConvolution" },
139                 { "border", "Pad" },
140                 { "concatenation", "Concat" },
141                 { "convolution", "Convolution" },
142                 { "deformable_convolution", "DeformableConvolution" },
143                 { "crop", "Crop" },
144                 { "custom_gpu_primitive", "CustomGPUPrimitive" },
145                 { "data", "Const" },
146                 { "deconvolution", "Deconvolution" },
147                 { "depth_to_space", "DepthToSpace" },
148                 { "detection_output", "DetectionOutput" },
149                 { "eltwise", "Eltwise" },
150                 { "fully_connected", "FullyConnected" },
151                 { "gather", "Gather" },
152                 { "gemm", "Gemm" },
153                 { "input_layout", "Input" },
154                 { "lrn", "LRN" },
155                 { "lstm", "LSTM" },
156                 { "lstm_elt", "LSTM_Eltwise" },
157                 { "lstm_gemm", "LSTM_Gemm" },
158                 { "mvn", "MVN" },
159                 { "normalize", "Normalize" },
160                 { "permute", "Permute" },
161                 { "pooling", "Pooling" },
162                 { "prior_box", "PriorBox" },
163                 { "proposal", "Proposal" },
164                 { "quantize", "Quantize" },
165                 { "region_yolo", "RegionYolo" },
166                 { "reorder", "Reorder" },
167                 { "reorg_yolo", "ReorgYolo" },
168                 { "reshape", "Reshape" },
169                 { "reverse_sequence", "ReverseSequence" },
170                 { "roi_pooling", "ROIPooling" },
171                 { "scale", "ScaleShift" },
172                 { "shuffle_channels", "ShuffleChannels" },
173                 { "softmax", "SoftMax" },
174                 { "split", "Split" },
175                 { "strided_slice", "StridedSlice" },
176                 { "tile", "Tile" },
177                 { "resample", "Resample" },
178                 { "interp", "Interp" },
179                 { "reduce_max", "ReduceMax" },
180                 { "reduce_min", "ReduceMin" },
181                 { "reduce_mean", "ReduceMean" },
182                 { "reduce_prod", "ReduceProd" },
183                 { "reduce_sum", "ReduceSum" },
184                 { "reduce_and", "ReduceAnd" },
185                 { "reduce_or", "ReduceOr" },
186                 { "reduce_sum_square", "ReduceSumSquare" },
187                 { "reduce_l1", "ReduceL1" },
188                 { "reduce_l2", "ReduceL2" },
189                 { "reduce_log_sum", "ReduceLogSum" },
190                 { "reduce_log_sum_exp", "ReduceLogSumExp" }
191         };
192
193         if (type_n2l.find(cldnn_name) != type_n2l.end())
194             return type_n2l.at(cldnn_name);
195
196         return cldnn_name;
197     };
198
199     auto concat_strings = [](std::vector<std::string> strs, char sep) -> std::string {
200         if (strs.empty())
201             return "";
202
203         std::string res = strs[0];
204         for (size_t i = 1; i < strs.size(); i++) {
205             res += sep + strs[i];
206         }
207
208         return res;
209     };
210
211     auto split_string = [](std::string src, std::string delimiter = ",") -> std::vector<std::string> {
212         std::vector<std::string> tokens;
213         std::string tokenBuf;
214         size_t prev = 0, pos = 0, srcLength = src.length(), delimLength = delimiter.length();
215         do {
216             pos = src.find(delimiter, prev);
217             if (pos == std::string::npos) {
218                 pos = srcLength;
219             }
220             tokenBuf = src.substr(prev, pos - prev);
221             if (!tokenBuf.empty()) {
222                 tokens.push_back(tokenBuf);
223             }
224             prev = pos + delimLength;
225         } while (pos < srcLength && prev < srcLength);
226
227         return tokens;
228     };
229
230     auto remove_type_from_name = [](const std::string& name) -> std::string {
231         auto it = std::find(name.begin(), name.end(), ':');
232         if (it == name.end() || (it + 1) == name.end())
233             return name;
234
235         return std::string((it+1), name.end());
236     };
237
238     auto find_origin_layers = [&](const std::string& name) -> std::vector<std::string> {
239         if (primitivesToIRLayersMap.find(name) == primitivesToIRLayersMap.end())
240             return {};
241
242         auto cnn_names = primitivesToIRLayersMap.at(name);
243         std::vector<std::string> res;
244
245         for (auto& cnn_name : cnn_names) {
246             if (IRToNgraphLayersMap.find(cnn_name) != IRToNgraphLayersMap.end()) {
247                 auto ngraph_names = split_string(IRToNgraphLayersMap.at(cnn_name));
248                 res.insert(res.end(), ngraph_names.begin(), ngraph_names.end());
249             } else {
250                 res.push_back(cnn_name);
251             }
252         }
253         return res;
254     };
255
256     auto create_layer = [&](const cldnn::primitive_info& prim_info) -> CNNLayer::Ptr {
257         CNNLayer::Ptr layer(new CNNLayer({"name", "type", Precision::UNSPECIFIED}));
258
259         layer->name = remove_type_from_name(prim_info.original_id);
260         layer->type = to_IE_type_name(prim_info.type_id);
261         layer->precision = data_type_to_precision(prim_info.output_layout.data_type);
262         std::vector<std::string> originalNames{find_origin_layers(prim_info.original_id)};
263         for (auto& fused_id : prim_info.c_fused_ids) {
264             for (auto& origin_id : find_origin_layers(fused_id)) {
265                 if (std::find(originalNames.begin(), originalNames.end(), origin_id) == originalNames.end())
266                     originalNames.push_back(origin_id);
267             }
268         }
269
270         layer->params[ExecGraphInfoSerialization::ORIGINAL_NAMES] = concat_strings(originalNames, ',');
271         layer->params[ExecGraphInfoSerialization::IMPL_TYPE] = prim_info.kernel_id;
272         layer->params[ExecGraphInfoSerialization::OUTPUT_PRECISIONS] = layer->precision.name();
273         std::string exec_time = "not_executed";
274         if (perfMap.find(prim_info.original_id) != perfMap.end()) {
275             auto perfCounter = perfMap.at(prim_info.original_id).second;
276             if (perfCounter.num > 0) {
277                 exec_time = std::to_string(perfCounter.realTime_avg());
278             }
279         }
280
281         layer->params[ExecGraphInfoSerialization::PERF_COUNTER] = exec_time;
282         layer->params[ExecGraphInfoSerialization::OUTPUT_LAYOUTS] = prim_info.layout_str;
283         layer->params[ExecGraphInfoSerialization::EXECUTION_ORDER] = std::to_string(prim_info.exec_id);
284
285         node2layer.emplace_back(prim_info, layer);
286
287         size_t in_size = prim_info.c_dependencies.size();
288
289         if (filter_const_primitives) {
290             // Decrease expected dependencies count if there is a const input without original id in the IR
291             for (auto& dep : prim_info.c_dependencies) {
292                 auto it = std::find_if(primitives_info.begin(), primitives_info.end(), [&](cldnn::primitive_info& entry) {
293                     return entry.original_id == dep;
294                 });
295
296                 if (it == primitives_info.end())
297                     --in_size;
298
299                 if (it->type_id == "data") {
300                     std::vector<std::string> childOriginalNames{find_origin_layers(prim_info.original_id)};
301                     --in_size;
302                 }
303             }
304         }
305         layer->insData.resize(in_size);
306         layer->outData.resize(prim_info.c_users.size());
307
308         return layer;
309     };
310
311     if (filter_const_primitives) {
312         for (auto& pi : primitives_info) {
313             // extract mutable_data primitives and connect it's dependencies and users directly
314             if (pi.type_id == "mutable_data") {
315                 if (pi.c_dependencies.size() == 1 && !pi.c_users.empty()) {
316                     auto dep = pi.c_dependencies[0];
317                     auto users = pi.c_users;
318                     auto it = std::find_if(primitives_info.begin(), primitives_info.end(), [&](cldnn::primitive_info& entry) {
319                         return entry.original_id == dep;
320                     });
321                     if (it == primitives_info.end())
322                         continue;
323
324                     auto& dep_users = it->c_users;
325                     // Remove mutable data from users list
326                     dep_users.erase(std::find_if(dep_users.begin(), dep_users.end(), [&](std::string user_id) {
327                         return user_id == pi.original_id;
328                     }));
329
330                     // Add mutable data users to it's dependency users
331                     dep_users.insert(dep_users.end(), users.begin(), users.end());
332
333                     for (auto& user : users) {
334                         it = std::find_if(primitives_info.begin(), primitives_info.end(), [&](cldnn::primitive_info& entry) {
335                             return entry.original_id == user;
336                         });
337                         if (it == primitives_info.end())
338                             continue;
339
340                         for (auto& d : it->c_dependencies) {
341                             if (d == pi.original_id)
342                                 d = dep;
343                         }
344                     }
345                 }
346             }
347         }
348     }
349
350     for (auto& pi : primitives_info) {
351         if (filter_const_primitives) {
352             // Skip const inputs
353             if (pi.type_id == "data") {
354                 continue;
355             }
356
357             // Skip mutable_data
358             if (pi.type_id == "mutable_data" &&
359                 pi.c_dependencies.size() == 1 &&
360                 !pi.c_users.empty()) {
361                 continue;
362             }
363         }
364         auto layer = create_layer(pi);
365         net->addLayer(layer);
366     }
367
368     auto desc_from_layout = [&](cldnn::layout layout) -> TensorDesc {
369         Precision precision = data_type_to_precision(layout.data_type);
370         SizeVector dims;
371         Layout l = Layout::NCHW;
372         auto size = layout.size;
373         if (layout.format.dimension() == 4) {
374             dims = {static_cast<size_t>(size.batch[0]),
375                     static_cast<size_t>(size.feature[0]),
376                     static_cast<size_t>(size.spatial[1]),
377                     static_cast<size_t>(size.spatial[0])};
378         } else if (layout.format.dimension() == 5) {
379             dims = {static_cast<size_t>(size.batch[0]),
380                     static_cast<size_t>(size.feature[0]),
381                     static_cast<size_t>(size.spatial[2]),
382                     static_cast<size_t>(size.spatial[1]),
383                     static_cast<size_t>(size.spatial[0])};
384             l = Layout::NCDHW;
385         } else if (layout.format.dimension() == 6) {
386             dims = {static_cast<size_t>(size.batch[0]),
387                     static_cast<size_t>(size.feature[0]),
388                     static_cast<size_t>(size.spatial[3]),
389                     static_cast<size_t>(size.spatial[2]),
390                     static_cast<size_t>(size.spatial[1]),
391                     static_cast<size_t>(size.spatial[0])};
392             // Should be NC?DHW but there is no such layout yet
393             l = Layout::BLOCKED;
394         }
395         TensorDesc dst{precision, dims, l};
396         return dst;
397     };
398
399     for (auto& pair : node2layer) {
400         auto pi = pair.first;
401         auto layer = pair.second;
402         auto user_ids = pi.c_users;
403         for (int i = 0; i < user_ids.size(); i++) {
404             auto it = std::find_if(node2layer.begin(), node2layer.end(), [&](std::pair<cldnn::primitive_info, CNNLayerPtr>& entry) {
405                 return entry.first.original_id == user_ids[i];
406             });
407
408             if (it == node2layer.end())
409                 continue;
410
411             auto& child_layer = it->second;
412
413             DataPtr data;
414             if (i < layer->outData.size()) {
415                 std::string data_name = pi.original_id + "_out" + std::to_string(i);
416                 layer->outData[i] = std::make_shared<Data>(data_name, desc_from_layout(pi.output_layout));
417                 data = layer->outData[i];
418                 data->getCreatorLayer() = layer;
419             } else {
420                 data = layer->outData[0];
421             }
422
423             int in_port_id = 0;
424             for (auto& dep : it->first.c_dependencies) {
425                 if (filter_const_primitives) {
426                     auto it = std::find_if(node2layer.begin(), node2layer.end(), [&](std::pair<cldnn::primitive_info, CNNLayerPtr>& entry) {
427                         return entry.first.original_id == dep;
428                     });
429
430                     if (it == node2layer.end())
431                         continue;
432                 }
433
434                 if (dep == pi.original_id && child_layer->insData[in_port_id].lock() == nullptr) {
435                     data->getInputTo()[child_layer->name] = child_layer;
436                     child_layer->insData[in_port_id] = data;
437                     break;
438                 }
439                 in_port_id++;
440             }
441         }
442     }
443     // Specify inputs data
444     for (auto& pair : node2layer) {
445         auto pi = pair.first;
446         auto layer = pair.second;
447         if (pi.c_dependencies.size() != 0)
448             continue;
449
450         auto in_info = std::make_shared<InputInfo>();
451         if (layer->outData.empty())
452             continue;
453
454         auto dt = layer->outData[0];
455         auto tensor_desc = desc_from_layout(pi.output_layout);
456
457         dt->setDims(tensor_desc.getDims());
458         dt->setPrecision(tensor_desc.getPrecision());
459         dt->setLayout(tensor_desc.getLayout());
460
461         in_info->setInputData(dt);
462         net->setInputInfo(in_info);
463     }
464
465     return net;
466 }
467
468 void CLDNNGraph::GetExecGraphInfo(InferenceEngine::ICNNNetwork::Ptr &graphPtr) {
469     auto primitives_info = GetNetwork()->get_primitives_info();
470     graphPtr = GetExecGraphInfoByPrimitivesInfo(primitives_info, true);
471 }
472
473
474 void CLDNNGraph::UpdatePerfStatistics() {
475     if (GetNetworksCount() == 0) {
476         return;
477     }
478
479     // Collect timings
480     auto collectTimings = [](cldnn::instrumentation::profiling_info& cldnnInfo, PerfCounter& pc) {
481         for (auto &interval : cldnnInfo.intervals) {
482             using duration_t = std::chrono::duration<long long, std::chrono::microseconds::period>;
483             auto count = std::chrono::duration_cast<duration_t>(interval.value->value()).count();
484
485             if (interval.name == "submission") {
486                 pc.cpu_uSec += count;
487             } else if (interval.name == "executing") {
488                 pc.realTime_uSec += count;
489             } else if (interval.name == "duration") {  // "duration" is used for CPU layers
490                 pc.cpu_uSec += count;
491
492                 if (pc.num == 0)
493                     pc.isCPU = true;
494             }
495         }
496     };
497
498     std::map<cldnn::primitive_id, cldnn::event> executedPrimitives = GetNetwork()->get_executed_primitives();
499     auto allPrimitives = GetNetwork()->get_all_primitives();
500
501     // Get profiling info for all layers
502     for (auto &profiledID : profilingIDs) {
503         auto pcIter = perfMap.find(profiledID);
504
505         if (pcIter == perfMap.end())  continue;
506
507         auto execIter = executedPrimitives.find(profiledID);
508         auto& perfCount = pcIter->second.second;
509         // Change status if layer wasn't executed by cldnn engine
510         if (execIter == executedPrimitives.end()) {
511             if (perfCount.num == 0) {
512                 perfCount.status = InferenceEngineProfileInfo::OPTIMIZED_OUT;
513             }
514             continue;
515         }
516
517         auto event = execIter->second;
518         executedPrimitives.erase(execIter);
519
520         cldnn::instrumentation::profiling_info cldnnInfo{profiledID, event.get_profiling_info()};
521
522         collectTimings(cldnnInfo, perfCount);
523         perfCount.num++;
524     }
525
526     for (auto &executedID : executedPrimitives) {
527         auto pcIter = perfMap.find(executedID.first);
528         if (pcIter == perfMap.end()) {
529             perfMap[executedID.first].first = executedID.first;
530             pcIter = perfMap.find(executedID.first);
531             auto& perfCount = pcIter->second.second;
532
533             cldnn::instrumentation::profiling_info cldnnInfo{executedID.first, executedID.second.get_profiling_info()};
534
535             collectTimings(cldnnInfo, perfCount);
536             perfCount.num++;
537         }
538     }
539 }
540
541 bool CLDNNGraph::IsLoaded() const {
542     return GetNetwork() != nullptr;
543 }
544
545 void CLDNNGraph::UpdateImplementationsMap() {
546     if (m_config.useProfiling) {
547         auto extractImplementationFromInfo = [](const std::string& info) -> std::string {
548             std::string def_implementation = "undef";
549             std::string impl_section = "implementation :";
550             std::string::size_type pos = info.find(impl_section);
551             if (pos == std::string::npos) {
552                 return def_implementation;
553             }
554
555             std::string::size_type end_pos = info.find(',', pos);
556             if (end_pos == std::string::npos) {
557                 return def_implementation;
558             }
559
560             std::string::size_type length = end_pos - pos - impl_section.size();
561
562             auto trim = [](const std::string& str) {
563                 size_t first = str.find_first_not_of(' ');
564                 if (std::string::npos == first) {
565                     return str;
566                 }
567                 size_t last = str.find_last_not_of(' ');
568                 return str.substr(first, (last - first + 1));
569             };
570             std::string tmp = trim(info.substr(pos + impl_section.size(), length));
571
572             return tmp.length() > 1 ? tmp : def_implementation;
573         };
574
575         // Parse primitive info and extract implementation name.
576         for (auto& id : profilingIDs) {
577             std::string prim_info = "";
578             try {
579                 prim_info = GetNetwork()->get_primitive_info(id);
580             } catch (std::exception& /*e*/) { }
581
582             implementationsMap.insert({id, extractImplementationFromInfo(prim_info)});
583         }
584     }
585 }
586
587 void CLDNNGraph::GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &result) const {
588     bool combinePrimByIRLayers = false;
589     unsigned i = 0;
590     auto allIds = GetNetwork()->get_all_primitive_org_ids();
591     auto executedPrimitives = GetNetwork()->get_executed_primitives();
592     auto primitivesInfo = GetNetwork()->get_primitives_info();
593
594     auto getUpperCaseName = [&](std::string name) {
595         if (name.length() > 0)
596             name[0] = toupper(name[0]);
597         return name;
598     };
599
600     auto getFromProfiling = [&](std::string primId) -> bool {
601         auto perfIter = perfMap.find(primId);
602
603         if (perfIter == perfMap.end())  return false;
604
605         const auto& layerName = perfIter->second.first;
606         if (layerName.length() == 0)  // no layer directly associated
607             return false;
608
609         const auto& perfCounter = perfIter->second.second;
610
611         if (!perfCounter.parentPrimitive.empty() && combinePrimByIRLayers)
612             return false;
613
614         auto& extPerfEntry = result[layerName];
615
616         memset(extPerfEntry.exec_type, 0, sizeof(extPerfEntry.exec_type));
617         if (perfCounter.isCPU) {
618             static const std::string cpuExecType("CPU");
619             cpuExecType.copy(extPerfEntry.exec_type, cpuExecType.length());  // Override execType as CPU
620         } else {
621             std::string impl = implementationsMap.at(primId);
622             impl.copy(extPerfEntry.exec_type, impl.length());
623         }
624
625         extPerfEntry.execution_index = i++;
626         extPerfEntry.status = perfCounter.status;
627         extPerfEntry.cpu_uSec = perfCounter.cpu_avg();
628         extPerfEntry.realTime_uSec = perfCounter.realTime_avg();
629
630         if (combinePrimByIRLayers) {
631             std::string kernelId = "";
632             long long kernelTime = 0;  // used for finding the most complex computation kernel in sub_graph for perf stat
633             for (auto &id : profilingIDs) {
634                 auto iter = perfMap.find(id);
635                 if (iter == perfMap.end())  continue;
636
637                 const auto &pc = iter->second.second;
638                 if (id != primId && pc.parentPrimitive == primId) {
639                     extPerfEntry.cpu_uSec += pc.cpu_avg();
640                     extPerfEntry.realTime_uSec += pc.realTime_avg();
641                     if (pc.realTime_avg() > kernelTime) {
642                         kernelTime = pc.realTime_avg();
643                         kernelId = id;
644                     }
645                     allIds.erase(std::find(allIds.begin(), allIds.end(), id));
646                 }
647             }
648             if (!kernelId.empty())
649                 implementationsMap.at(kernelId).copy(extPerfEntry.exec_type, implementationsMap.at(kernelId).length());
650         }
651
652         getUpperCaseName(perfCounter.layerType).copy(extPerfEntry.layer_type, perfCounter.layerType.length());
653         return true;
654     };
655
656     // Step 1. Get all primitives in execution order which was added by clDNNPlugin
657     for (auto& primId : profilingIDs) {
658         getFromProfiling(primId);
659     }
660
661     // Step 2. Find all other primitives which was added while optimization process and executed after
662     for (auto& primId : allIds) {
663         auto perfIter = perfMap.find(primId);
664         if (perfIter == perfMap.end())  continue;
665
666         bool existInProfiling = std::find(profilingIDs.begin(), profilingIDs.end(), primId) != profilingIDs.end();
667         if ((!existInProfiling || (existInProfiling && perfIter->second.first.length() == 0)) &&
668             executedPrimitives.find(primId) != executedPrimitives.end()) {
669             auto event = executedPrimitives.at(primId);
670
671             cldnn::instrumentation::profiling_info cldnnInfo{primId, event.get_profiling_info()};
672
673             // Collect timings
674             long long cpuTime = 0;
675             long long deviceTime = 0;
676
677             for (auto &interval : cldnnInfo.intervals) {
678                 using duration_t = std::chrono::duration<long long, std::chrono::microseconds::period>;
679                 auto count = std::chrono::duration_cast<duration_t>(interval.value->value()).count();
680
681                 if (interval.name == "submission") {
682                     cpuTime += count;
683                 } else if (interval.name == "executing") {
684                     deviceTime += count;
685                 } else if (interval.name == "duration") {  // "duration" is used for CPU layers
686                     cpuTime += count;
687                 }
688             }
689
690             std::string layerName = primId;
691             if (primId.find(":") != std::string::npos) {
692                 layerName = primId.substr(primId.find(":") + 1, primId.length());
693             }
694
695             for (auto& pi : primitivesInfo) {
696                 if (pi.original_id == primId) {
697                     if (pi.type_id == "mutable_data")
698                         continue;
699
700                     auto& extPerfEntry = result[layerName];
701
702                     if (pi.is_cpu) {
703                         static const std::string cpuExecType("CPU");
704                         memset(extPerfEntry.exec_type, 0, sizeof(extPerfEntry.exec_type));
705                         cpuExecType.copy(extPerfEntry.exec_type, cpuExecType.length());  // Override execType as CPU
706                     } else {
707                         std::string impl = pi.kernel_id;
708                         impl.copy(extPerfEntry.exec_type, impl.length());
709                     }
710
711                     getUpperCaseName(pi.type_id).copy(extPerfEntry.layer_type, pi.type_id.length());
712                     extPerfEntry.execution_index = i++;
713                     extPerfEntry.status = InferenceEngineProfileInfo::LayerStatus::EXECUTED;
714                     extPerfEntry.cpu_uSec = cpuTime;
715                     extPerfEntry.realTime_uSec = deviceTime;
716
717                     if (pi.type_id == "input_layout") {
718                         const std::string input_string = "Input";
719                         const std::string undef_string = "undef";
720                         input_string.copy(extPerfEntry.layer_type, 256);
721                         undef_string.copy(extPerfEntry.exec_type, 256);
722                     }
723                 }
724             }
725         }
726     }
727
728     // Step 3. Checking primitives which has been deleted from execution order but added by clDNNPlugin
729     for (auto& primId : profilingIDs)
730         if (std::find(allIds.begin(), allIds.end(), primId) == allIds.end()) {
731             getFromProfiling(primId);
732         }
733 }
734
735 std::shared_ptr<cldnn::network> CLDNNGraph::GetNetwork(size_t idx) const {
736     if (idx >= GetNetworksCount())
737         THROW_IE_EXCEPTION << "Unable to find network with id=" << idx << ". Stored networks count: " << GetNetworksCount();
738
739     return m_networks[idx];
740 }
741
742
743 std::string CLDNNGraph::MapOutputName(std::string outName) const {
744     auto networkOutputsIDs = GetNetwork()->get_output_ids();
745     auto allPrimitiveIds = GetNetwork()->get_all_primitives();
746
747     // Find correct output ID. Start with name stored in IR.
748     std::string outputID = primitiveIDs.at(outName);
749     while (std::find(networkOutputsIDs.begin(), networkOutputsIDs.end(), outputID) == networkOutputsIDs.end()) {
750         // If current ID isn't found in cldnn network outputs, get previous primitive id and try again.
751         auto prim = allPrimitiveIds.find(outputID);
752         if (prim == allPrimitiveIds.end()) {
753             THROW_IE_EXCEPTION << "Unknown primitive id " << outputID;
754         }
755
756         if (prevPrimitiveIDs.at(outputID).size() != 1 || prim->second != "_optimized_") {
757             THROW_IE_EXCEPTION << "Unable to find parent for output primitive " << outputID;
758         }
759         outputID = prevPrimitiveIDs.at(outputID)[0];
760     }
761
762     return outputID;
763 }
764
765 InferenceEngine::SizeVector CLDNNGraph::GetOutputSize(std::string outName) const {
766     auto res_output = outputDims.find(outName);
767
768     InferenceEngine::SizeVector sz;
769     if (res_output != outputDims.end())
770         sz = res_output->second;
771     else
772         sz = outputDims.at(primitiveIDs.at(outName));
773
774     return sz;
775 }
776
777 };  // namespace CLDNNPlugin