inference-engine/src/cldnn_engine/cldnn_graph.cpp

   1 // Copyright (C) 2018-2020 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
   5 #include <list>
   6 #include <set>
   7 #include <unordered_set>
   8 #include <sstream>
   9 #include <api/cldnn.hpp>
  10 #include <api/network.hpp>
  11 #include <api/profiling.hpp>
  12 #include <api/custom_gpu_primitive.hpp>
  13 #include <chrono>
  14 #include <cmath>
  15 #include <algorithm>
  16 #include "cldnn_graph.h"
  17 #include "simple_math.h"
  18 #include <description_buffer.hpp>
  19 #include <cldnn/cldnn_config.hpp>
  20 #include <graph_tools.hpp>
  21 #include <ie_layers_internal.hpp>
  22 #include <net_pass.h>
  23 #include "cldnn_infer_request.h"
  24 #include <threading/ie_executor_manager.hpp>
  25 #include "details/caseless.hpp"
  26 #include <fstream>
  27 #include <utility>
  28 #include <sys/types.h>
  29 #include <sys/stat.h>
  30 #include <exec_graph_info.hpp>
  31
  32 using namespace InferenceEngine;
  33 using namespace InferenceEngine::details;
  34
  35 namespace CLDNNPlugin {
  36
  37 CLDNNGraph::CLDNNGraph(InferenceEngine::ICNNNetwork& network, gpu::ClContext::Ptr context, Config config, uint16_t stream_id)
  38     : m_context(context)
  39     , m_networkName(network.getName())
  40     , m_config(config)
  41     , m_stream_id(stream_id) {
  42     m_program = std::make_shared<Program>(network, GetEngine(), m_config);
  43     Build();
  44 }
  45
  46 CLDNNGraph::CLDNNGraph(std::shared_ptr<CLDNNGraph> graph, uint16_t stream_id)
  47         : m_context(graph->m_context)
  48         , m_program(graph->m_program)
  49         , m_networkName(graph->m_networkName)
  50         , m_config(graph->m_config)
  51         , m_stream_id(stream_id) {
  52     Build();
  53 }
  54
  55 void CLDNNGraph::UpdateLayersMaps() {
  56     primitiveIDs = m_program->primitiveIDs;
  57     primitivesToIRLayersMap = m_program->primitivesToIRLayersMap;
  58     IRToNgraphLayersMap = m_program->IRToNgraphLayersMap;
  59     prevPrimitiveIDs = m_program->prevPrimitiveIDs;
  60     profilingIDs = m_program->profilingIDs;
  61     perfMap = m_program->perfMap;
  62     outputDims = m_program->outputDims;
  63 }
  64
  65 void CLDNNGraph::Build() {
  66     UpdateLayersMaps();
  67
  68     if (GetMaxDynamicBatchSize() > 1) {
  69         int m_bv_sz = m_program->GetMaxBatchSizeForSingleProgram();
  70         for (int b = m_bv_sz - 1; b >= 0; b--) {
  71             auto network = BuildNetwork(m_program->getCompiledProgram(b));
  72             m_networks.insert(m_networks.begin(), network);
  73             GetEngine()->release_pending_memory(network->get_id());
  74         }
  75     } else {
  76         auto network = BuildNetwork(m_program->getCompiledProgram());
  77         m_networks.emplace_back(network);
  78         GetEngine()->release_pending_memory(network->get_id());
  79     }
  80
  81     UpdateImplementationsMap();
  82 }
  83
  84 std::shared_ptr<cldnn::network> CLDNNGraph::BuildNetwork(std::shared_ptr<cldnn::program> program) {
  85     auto network = std::make_shared<cldnn::network>(*program, m_stream_id);
  86
  87     if (!m_config.graph_dumps_dir.empty() && m_stream_id == 0) {
  88         static int net_id = 0;
  89         auto steps_info = network->get_optimization_steps_info();
  90         size_t step_idx = 0;
  91         for (auto& step : steps_info) {
  92             CNNNetwork net(GetExecGraphInfoByPrimitivesInfo(step.second, true));
  93             net.serialize(m_config.graph_dumps_dir + std::to_string(net_id) + "_" +
  94                           std::to_string(step_idx) + "_" + step.first + "_graph.xml");
  95             step_idx++;
  96         }
  97         net_id++;
  98     }
  99
 100     return network;
 101 }
 102
 103 InferenceEngine::ICNNNetwork::Ptr CLDNNGraph::GetExecGraphInfoByPrimitivesInfo(std::vector<cldnn::primitive_info>& primitives_info,
 104                                                                                bool filter_const_primitives) {
 105     auto net = std::make_shared<details::CNNNetworkImpl>();
 106     net->setPrecision(Precision::FP32);
 107     net->setName("runtime_gpu_graph");
 108     if (m_config.useProfiling) {
 109         try {
 110             // Update may throw an exception for step-by-step runtime graph dump,
 111             // since network->get_executed_primitives() method can't be called before network execution
 112             UpdatePerfStatistics();
 113         } catch (std::exception&) {
 114         }
 115     }
 116
 117     std::vector<std::pair<cldnn::primitive_info, CNNLayerPtr>> node2layer;
 118
 119     auto data_type_to_precision = [](cldnn::data_types dt) {
 120         switch (dt) {
 121             case cldnn::data_types::bin: return Precision::BIN;
 122             case cldnn::data_types::f32: return Precision::FP32;
 123             case cldnn::data_types::f16: return Precision::FP16;
 124             case cldnn::data_types::i32: return Precision::I32;
 125             case cldnn::data_types::i64: return Precision::I64;
 126             case cldnn::data_types::u8: return Precision::U8;
 127             case cldnn::data_types::i8: return Precision::I8;
 128             default: return Precision::UNSPECIFIED;
 129         }
 130     };
 131
 132     auto to_IE_type_name = [](const std::string& cldnn_name) -> std::string{
 133         static std::map<std::string, std::string> type_n2l {
 134                 { "activation", "Activation" },
 135                 { "arg_max_min", "ArgMax" },
 136                 { "average_unpooling", "AverageUnpooling" },
 137                 { "batch_norm", "BatchNormalization" },
 138                 { "binary_convolution", "BinaryConvolution" },
 139                 { "border", "Pad" },
 140                 { "concatenation", "Concat" },
 141                 { "convolution", "Convolution" },
 142                 { "deformable_convolution", "DeformableConvolution" },
 143                 { "crop", "Crop" },
 144                 { "custom_gpu_primitive", "CustomGPUPrimitive" },
 145                 { "data", "Const" },
 146                 { "deconvolution", "Deconvolution" },
 147                 { "depth_to_space", "DepthToSpace" },
 148                 { "detection_output", "DetectionOutput" },
 149                 { "eltwise", "Eltwise" },
 150                 { "fully_connected", "FullyConnected" },
 151                 { "gather", "Gather" },
 152                 { "gemm", "Gemm" },
 153                 { "input_layout", "Input" },
 154                 { "lrn", "LRN" },
 155                 { "lstm", "LSTM" },
 156                 { "lstm_elt", "LSTM_Eltwise" },
 157                 { "lstm_gemm", "LSTM_Gemm" },
 158                 { "mvn", "MVN" },
 159                 { "normalize", "Normalize" },
 160                 { "permute", "Permute" },
 161                 { "pooling", "Pooling" },
 162                 { "prior_box", "PriorBox" },
 163                 { "proposal", "Proposal" },
 164                 { "quantize", "Quantize" },
 165                 { "region_yolo", "RegionYolo" },
 166                 { "reorder", "Reorder" },
 167                 { "reorg_yolo", "ReorgYolo" },
 168                 { "reshape", "Reshape" },
 169                 { "reverse_sequence", "ReverseSequence" },
 170                 { "roi_pooling", "ROIPooling" },
 171                 { "scale", "ScaleShift" },
 172                 { "shuffle_channels", "ShuffleChannels" },
 173                 { "softmax", "SoftMax" },
 174                 { "split", "Split" },
 175                 { "strided_slice", "StridedSlice" },
 176                 { "tile", "Tile" },
 177                 { "resample", "Resample" },
 178                 { "interp", "Interp" },
 179                 { "reduce_max", "ReduceMax" },
 180                 { "reduce_min", "ReduceMin" },
 181                 { "reduce_mean", "ReduceMean" },
 182                 { "reduce_prod", "ReduceProd" },
 183                 { "reduce_sum", "ReduceSum" },
 184                 { "reduce_and", "ReduceAnd" },
 185                 { "reduce_or", "ReduceOr" },
 186                 { "reduce_sum_square", "ReduceSumSquare" },
 187                 { "reduce_l1", "ReduceL1" },
 188                 { "reduce_l2", "ReduceL2" },
 189                 { "reduce_log_sum", "ReduceLogSum" },
 190                 { "reduce_log_sum_exp", "ReduceLogSumExp" }
 191         };
 192
 193         if (type_n2l.find(cldnn_name) != type_n2l.end())
 194             return type_n2l.at(cldnn_name);
 195
 196         return cldnn_name;
 197     };
 198
 199     auto concat_strings = [](std::vector<std::string> strs, char sep) -> std::string {
 200         if (strs.empty())
 201             return "";
 202
 203         std::string res = strs[0];
 204         for (size_t i = 1; i < strs.size(); i++) {
 205             res += sep + strs[i];
 206         }
 207
 208         return res;
 209     };
 210
 211     auto split_string = [](std::string src, std::string delimiter = ",") -> std::vector<std::string> {
 212         std::vector<std::string> tokens;
 213         std::string tokenBuf;
 214         size_t prev = 0, pos = 0, srcLength = src.length(), delimLength = delimiter.length();
 215         do {
 216             pos = src.find(delimiter, prev);
 217             if (pos == std::string::npos) {
 218                 pos = srcLength;
 219             }
 220             tokenBuf = src.substr(prev, pos - prev);
 221             if (!tokenBuf.empty()) {
 222                 tokens.push_back(tokenBuf);
 223             }
 224             prev = pos + delimLength;
 225         } while (pos < srcLength && prev < srcLength);
 226
 227         return tokens;
 228     };
 229
 230     auto remove_type_from_name = [](const std::string& name) -> std::string {
 231         auto it = std::find(name.begin(), name.end(), ':');
 232         if (it == name.end() || (it + 1) == name.end())
 233             return name;
 234
 235         return std::string((it+1), name.end());
 236     };
 237
 238     auto find_origin_layers = [&](const std::string& name) -> std::vector<std::string> {
 239         if (primitivesToIRLayersMap.find(name) == primitivesToIRLayersMap.end())
 240             return {};
 241
 242         auto cnn_names = primitivesToIRLayersMap.at(name);
 243         std::vector<std::string> res;
 244
 245         for (auto& cnn_name : cnn_names) {
 246             if (IRToNgraphLayersMap.find(cnn_name) != IRToNgraphLayersMap.end()) {
 247                 auto ngraph_names = split_string(IRToNgraphLayersMap.at(cnn_name));
 248                 res.insert(res.end(), ngraph_names.begin(), ngraph_names.end());
 249             } else {
 250                 res.push_back(cnn_name);
 251             }
 252         }
 253         return res;
 254     };
 255
 256     auto create_layer = [&](const cldnn::primitive_info& prim_info) -> CNNLayer::Ptr {
 257         CNNLayer::Ptr layer(new CNNLayer({"name", "type", Precision::UNSPECIFIED}));
 258
 259         layer->name = remove_type_from_name(prim_info.original_id);
 260         layer->type = to_IE_type_name(prim_info.type_id);
 261         layer->precision = data_type_to_precision(prim_info.output_layout.data_type);
 262         std::vector<std::string> originalNames{find_origin_layers(prim_info.original_id)};
 263         for (auto& fused_id : prim_info.c_fused_ids) {
 264             for (auto& origin_id : find_origin_layers(fused_id)) {
 265                 if (std::find(originalNames.begin(), originalNames.end(), origin_id) == originalNames.end())
 266                     originalNames.push_back(origin_id);
 267             }
 268         }
 269
 270         layer->params[ExecGraphInfoSerialization::ORIGINAL_NAMES] = concat_strings(originalNames, ',');
 271         layer->params[ExecGraphInfoSerialization::IMPL_TYPE] = prim_info.kernel_id;
 272         layer->params[ExecGraphInfoSerialization::OUTPUT_PRECISIONS] = layer->precision.name();
 273         std::string exec_time = "not_executed";
 274         if (perfMap.find(prim_info.original_id) != perfMap.end()) {
 275             auto perfCounter = perfMap.at(prim_info.original_id).second;
 276             if (perfCounter.num > 0) {
 277                 exec_time = std::to_string(perfCounter.realTime_avg());
 278             }
 279         }
 280
 281         layer->params[ExecGraphInfoSerialization::PERF_COUNTER] = exec_time;
 282         layer->params[ExecGraphInfoSerialization::OUTPUT_LAYOUTS] = prim_info.layout_str;
 283         layer->params[ExecGraphInfoSerialization::EXECUTION_ORDER] = std::to_string(prim_info.exec_id);
 284
 285         node2layer.emplace_back(prim_info, layer);
 286
 287         size_t in_size = prim_info.c_dependencies.size();
 288
 289         if (filter_const_primitives) {
 290             // Decrease expected dependencies count if there is a const input without original id in the IR
 291             for (auto& dep : prim_info.c_dependencies) {
 292                 auto it = std::find_if(primitives_info.begin(), primitives_info.end(), [&](cldnn::primitive_info& entry) {
 293                     return entry.original_id == dep;
 294                 });
 295
 296                 if (it == primitives_info.end())
 297                     --in_size;
 298
 299                 if (it->type_id == "data") {
 300                     std::vector<std::string> childOriginalNames{find_origin_layers(prim_info.original_id)};
 301                     --in_size;
 302                 }
 303             }
 304         }
 305         layer->insData.resize(in_size);
 306         layer->outData.resize(prim_info.c_users.size());
 307
 308         return layer;
 309     };
 310
 311     if (filter_const_primitives) {
 312         for (auto& pi : primitives_info) {
 313             // extract mutable_data primitives and connect it's dependencies and users directly
 314             if (pi.type_id == "mutable_data") {
 315                 if (pi.c_dependencies.size() == 1 && !pi.c_users.empty()) {
 316                     auto dep = pi.c_dependencies[0];
 317                     auto users = pi.c_users;
 318                     auto it = std::find_if(primitives_info.begin(), primitives_info.end(), [&](cldnn::primitive_info& entry) {
 319                         return entry.original_id == dep;
 320                     });
 321                     if (it == primitives_info.end())
 322                         continue;
 323
 324                     auto& dep_users = it->c_users;
 325                     // Remove mutable data from users list
 326                     dep_users.erase(std::find_if(dep_users.begin(), dep_users.end(), [&](std::string user_id) {
 327                         return user_id == pi.original_id;
 328                     }));
 329
 330                     // Add mutable data users to it's dependency users
 331                     dep_users.insert(dep_users.end(), users.begin(), users.end());
 332
 333                     for (auto& user : users) {
 334                         it = std::find_if(primitives_info.begin(), primitives_info.end(), [&](cldnn::primitive_info& entry) {
 335                             return entry.original_id == user;
 336                         });
 337                         if (it == primitives_info.end())
 338                             continue;
 339
 340                         for (auto& d : it->c_dependencies) {
 341                             if (d == pi.original_id)
 342                                 d = dep;
 343                         }
 344                     }
 345                 }
 346             }
 347         }
 348     }
 349
 350     for (auto& pi : primitives_info) {
 351         if (filter_const_primitives) {
 352             // Skip const inputs
 353             if (pi.type_id == "data") {
 354                 continue;
 355             }
 356
 357             // Skip mutable_data
 358             if (pi.type_id == "mutable_data" &&
 359                 pi.c_dependencies.size() == 1 &&
 360                 !pi.c_users.empty()) {
 361                 continue;
 362             }
 363         }
 364         auto layer = create_layer(pi);
 365         net->addLayer(layer);
 366     }
 367
 368     auto desc_from_layout = [&](cldnn::layout layout) -> TensorDesc {
 369         Precision precision = data_type_to_precision(layout.data_type);
 370         SizeVector dims;
 371         Layout l = Layout::NCHW;
 372         auto size = layout.size;
 373         if (layout.format.dimension() == 4) {
 374             dims = {static_cast<size_t>(size.batch[0]),
 375                     static_cast<size_t>(size.feature[0]),
 376                     static_cast<size_t>(size.spatial[1]),
 377                     static_cast<size_t>(size.spatial[0])};
 378         } else if (layout.format.dimension() == 5) {
 379             dims = {static_cast<size_t>(size.batch[0]),
 380                     static_cast<size_t>(size.feature[0]),
 381                     static_cast<size_t>(size.spatial[2]),
 382                     static_cast<size_t>(size.spatial[1]),
 383                     static_cast<size_t>(size.spatial[0])};
 384             l = Layout::NCDHW;
 385         } else if (layout.format.dimension() == 6) {
 386             dims = {static_cast<size_t>(size.batch[0]),
 387                     static_cast<size_t>(size.feature[0]),
 388                     static_cast<size_t>(size.spatial[3]),
 389                     static_cast<size_t>(size.spatial[2]),
 390                     static_cast<size_t>(size.spatial[1]),
 391                     static_cast<size_t>(size.spatial[0])};
 392             // Should be NC?DHW but there is no such layout yet
 393             l = Layout::BLOCKED;
 394         }
 395         TensorDesc dst{precision, dims, l};
 396         return dst;
 397     };
 398
 399     for (auto& pair : node2layer) {
 400         auto pi = pair.first;
 401         auto layer = pair.second;
 402         auto user_ids = pi.c_users;
 403         for (int i = 0; i < user_ids.size(); i++) {
 404             auto it = std::find_if(node2layer.begin(), node2layer.end(), [&](std::pair<cldnn::primitive_info, CNNLayerPtr>& entry) {
 405                 return entry.first.original_id == user_ids[i];
 406             });
 407
 408             if (it == node2layer.end())
 409                 continue;
 410
 411             auto& child_layer = it->second;
 412
 413             DataPtr data;
 414             if (i < layer->outData.size()) {
 415                 std::string data_name = pi.original_id + "_out" + std::to_string(i);
 416                 layer->outData[i] = std::make_shared<Data>(data_name, desc_from_layout(pi.output_layout));
 417                 data = layer->outData[i];
 418                 data->getCreatorLayer() = layer;
 419             } else {
 420                 data = layer->outData[0];
 421             }
 422
 423             int in_port_id = 0;
 424             for (auto& dep : it->first.c_dependencies) {
 425                 if (filter_const_primitives) {
 426                     auto it = std::find_if(node2layer.begin(), node2layer.end(), [&](std::pair<cldnn::primitive_info, CNNLayerPtr>& entry) {
 427                         return entry.first.original_id == dep;
 428                     });
 429
 430                     if (it == node2layer.end())
 431                         continue;
 432                 }
 433
 434                 if (dep == pi.original_id && child_layer->insData[in_port_id].lock() == nullptr) {
 435                     data->getInputTo()[child_layer->name] = child_layer;
 436                     child_layer->insData[in_port_id] = data;
 437                     break;
 438                 }
 439                 in_port_id++;
 440             }
 441         }
 442     }
 443     // Specify inputs data
 444     for (auto& pair : node2layer) {
 445         auto pi = pair.first;
 446         auto layer = pair.second;
 447         if (pi.c_dependencies.size() != 0)
 448             continue;
 449
 450         auto in_info = std::make_shared<InputInfo>();
 451         if (layer->outData.empty())
 452             continue;
 453
 454         auto dt = layer->outData[0];
 455         auto tensor_desc = desc_from_layout(pi.output_layout);
 456
 457         dt->setDims(tensor_desc.getDims());
 458         dt->setPrecision(tensor_desc.getPrecision());
 459         dt->setLayout(tensor_desc.getLayout());
 460
 461         in_info->setInputData(dt);
 462         net->setInputInfo(in_info);
 463     }
 464
 465     return net;
 466 }
 467
 468 void CLDNNGraph::GetExecGraphInfo(InferenceEngine::ICNNNetwork::Ptr &graphPtr) {
 469     auto primitives_info = GetNetwork()->get_primitives_info();
 470     graphPtr = GetExecGraphInfoByPrimitivesInfo(primitives_info, true);
 471 }
 472
 473
 474 void CLDNNGraph::UpdatePerfStatistics() {
 475     if (GetNetworksCount() == 0) {
 476         return;
 477     }
 478
 479     // Collect timings
 480     auto collectTimings = [](cldnn::instrumentation::profiling_info& cldnnInfo, PerfCounter& pc) {
 481         for (auto &interval : cldnnInfo.intervals) {
 482             using duration_t = std::chrono::duration<long long, std::chrono::microseconds::period>;
 483             auto count = std::chrono::duration_cast<duration_t>(interval.value->value()).count();
 484
 485             if (interval.name == "submission") {
 486                 pc.cpu_uSec += count;
 487             } else if (interval.name == "executing") {
 488                 pc.realTime_uSec += count;
 489             } else if (interval.name == "duration") {  // "duration" is used for CPU layers
 490                 pc.cpu_uSec += count;
 491
 492                 if (pc.num == 0)
 493                     pc.isCPU = true;
 494             }
 495         }
 496     };
 497
 498     std::map<cldnn::primitive_id, cldnn::event> executedPrimitives = GetNetwork()->get_executed_primitives();
 499     auto allPrimitives = GetNetwork()->get_all_primitives();
 500
 501     // Get profiling info for all layers
 502     for (auto &profiledID : profilingIDs) {
 503         auto pcIter = perfMap.find(profiledID);
 504
 505         if (pcIter == perfMap.end())  continue;
 506
 507         auto execIter = executedPrimitives.find(profiledID);
 508         auto& perfCount = pcIter->second.second;
 509         // Change status if layer wasn't executed by cldnn engine
 510         if (execIter == executedPrimitives.end()) {
 511             if (perfCount.num == 0) {
 512                 perfCount.status = InferenceEngineProfileInfo::OPTIMIZED_OUT;
 513             }
 514             continue;
 515         }
 516
 517         auto event = execIter->second;
 518         executedPrimitives.erase(execIter);
 519
 520         cldnn::instrumentation::profiling_info cldnnInfo{profiledID, event.get_profiling_info()};
 521
 522         collectTimings(cldnnInfo, perfCount);
 523         perfCount.num++;
 524     }
 525
 526     for (auto &executedID : executedPrimitives) {
 527         auto pcIter = perfMap.find(executedID.first);
 528         if (pcIter == perfMap.end()) {
 529             perfMap[executedID.first].first = executedID.first;
 530             pcIter = perfMap.find(executedID.first);
 531             auto& perfCount = pcIter->second.second;
 532
 533             cldnn::instrumentation::profiling_info cldnnInfo{executedID.first, executedID.second.get_profiling_info()};
 534
 535             collectTimings(cldnnInfo, perfCount);
 536             perfCount.num++;
 537         }
 538     }
 539 }
 540
 541 bool CLDNNGraph::IsLoaded() const {
 542     return GetNetwork() != nullptr;
 543 }
 544
 545 void CLDNNGraph::UpdateImplementationsMap() {
 546     if (m_config.useProfiling) {
 547         auto extractImplementationFromInfo = [](const std::string& info) -> std::string {
 548             std::string def_implementation = "undef";
 549             std::string impl_section = "implementation :";
 550             std::string::size_type pos = info.find(impl_section);
 551             if (pos == std::string::npos) {
 552                 return def_implementation;
 553             }
 554
 555             std::string::size_type end_pos = info.find(',', pos);
 556             if (end_pos == std::string::npos) {
 557                 return def_implementation;
 558             }
 559
 560             std::string::size_type length = end_pos - pos - impl_section.size();
 561
 562             auto trim = [](const std::string& str) {
 563                 size_t first = str.find_first_not_of(' ');
 564                 if (std::string::npos == first) {
 565                     return str;
 566                 }
 567                 size_t last = str.find_last_not_of(' ');
 568                 return str.substr(first, (last - first + 1));
 569             };
 570             std::string tmp = trim(info.substr(pos + impl_section.size(), length));
 571
 572             return tmp.length() > 1 ? tmp : def_implementation;
 573         };
 574
 575         // Parse primitive info and extract implementation name.
 576         for (auto& id : profilingIDs) {
 577             std::string prim_info = "";
 578             try {
 579                 prim_info = GetNetwork()->get_primitive_info(id);
 580             } catch (std::exception& /*e*/) { }
 581
 582             implementationsMap.insert({id, extractImplementationFromInfo(prim_info)});
 583         }
 584     }
 585 }
 586
 587 void CLDNNGraph::GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &result) const {
 588     bool combinePrimByIRLayers = false;
 589     unsigned i = 0;
 590     auto allIds = GetNetwork()->get_all_primitive_org_ids();
 591     auto executedPrimitives = GetNetwork()->get_executed_primitives();
 592     auto primitivesInfo = GetNetwork()->get_primitives_info();
 593
 594     auto getUpperCaseName = [&](std::string name) {
 595         if (name.length() > 0)
 596             name[0] = toupper(name[0]);
 597         return name;
 598     };
 599
 600     auto getFromProfiling = [&](std::string primId) -> bool {
 601         auto perfIter = perfMap.find(primId);
 602
 603         if (perfIter == perfMap.end())  return false;
 604
 605         const auto& layerName = perfIter->second.first;
 606         if (layerName.length() == 0)  // no layer directly associated
 607             return false;
 608
 609         const auto& perfCounter = perfIter->second.second;
 610
 611         if (!perfCounter.parentPrimitive.empty() && combinePrimByIRLayers)
 612             return false;
 613
 614         auto& extPerfEntry = result[layerName];
 615
 616         memset(extPerfEntry.exec_type, 0, sizeof(extPerfEntry.exec_type));
 617         if (perfCounter.isCPU) {
 618             static const std::string cpuExecType("CPU");
 619             cpuExecType.copy(extPerfEntry.exec_type, cpuExecType.length());  // Override execType as CPU
 620         } else {
 621             std::string impl = implementationsMap.at(primId);
 622             impl.copy(extPerfEntry.exec_type, impl.length());
 623         }
 624
 625         extPerfEntry.execution_index = i++;
 626         extPerfEntry.status = perfCounter.status;
 627         extPerfEntry.cpu_uSec = perfCounter.cpu_avg();
 628         extPerfEntry.realTime_uSec = perfCounter.realTime_avg();
 629
 630         if (combinePrimByIRLayers) {
 631             std::string kernelId = "";
 632             long long kernelTime = 0;  // used for finding the most complex computation kernel in sub_graph for perf stat
 633             for (auto &id : profilingIDs) {
 634                 auto iter = perfMap.find(id);
 635                 if (iter == perfMap.end())  continue;
 636
 637                 const auto &pc = iter->second.second;
 638                 if (id != primId && pc.parentPrimitive == primId) {
 639                     extPerfEntry.cpu_uSec += pc.cpu_avg();
 640                     extPerfEntry.realTime_uSec += pc.realTime_avg();
 641                     if (pc.realTime_avg() > kernelTime) {
 642                         kernelTime = pc.realTime_avg();
 643                         kernelId = id;
 644                     }
 645                     allIds.erase(std::find(allIds.begin(), allIds.end(), id));
 646                 }
 647             }
 648             if (!kernelId.empty())
 649                 implementationsMap.at(kernelId).copy(extPerfEntry.exec_type, implementationsMap.at(kernelId).length());
 650         }
 651
 652         getUpperCaseName(perfCounter.layerType).copy(extPerfEntry.layer_type, perfCounter.layerType.length());
 653         return true;
 654     };
 655
 656     // Step 1. Get all primitives in execution order which was added by clDNNPlugin
 657     for (auto& primId : profilingIDs) {
 658         getFromProfiling(primId);
 659     }
 660
 661     // Step 2. Find all other primitives which was added while optimization process and executed after
 662     for (auto& primId : allIds) {
 663         auto perfIter = perfMap.find(primId);
 664         if (perfIter == perfMap.end())  continue;
 665
 666         bool existInProfiling = std::find(profilingIDs.begin(), profilingIDs.end(), primId) != profilingIDs.end();
 667         if ((!existInProfiling || (existInProfiling && perfIter->second.first.length() == 0)) &&
 668             executedPrimitives.find(primId) != executedPrimitives.end()) {
 669             auto event = executedPrimitives.at(primId);
 670
 671             cldnn::instrumentation::profiling_info cldnnInfo{primId, event.get_profiling_info()};
 672
 673             // Collect timings
 674             long long cpuTime = 0;
 675             long long deviceTime = 0;
 676
 677             for (auto &interval : cldnnInfo.intervals) {
 678                 using duration_t = std::chrono::duration<long long, std::chrono::microseconds::period>;
 679                 auto count = std::chrono::duration_cast<duration_t>(interval.value->value()).count();
 680
 681                 if (interval.name == "submission") {
 682                     cpuTime += count;
 683                 } else if (interval.name == "executing") {
 684                     deviceTime += count;
 685                 } else if (interval.name == "duration") {  // "duration" is used for CPU layers
 686                     cpuTime += count;
 687                 }
 688             }
 689
 690             std::string layerName = primId;
 691             if (primId.find(":") != std::string::npos) {
 692                 layerName = primId.substr(primId.find(":") + 1, primId.length());
 693             }
 694
 695             for (auto& pi : primitivesInfo) {
 696                 if (pi.original_id == primId) {
 697                     if (pi.type_id == "mutable_data")
 698                         continue;
 699
 700                     auto& extPerfEntry = result[layerName];
 701
 702                     if (pi.is_cpu) {
 703                         static const std::string cpuExecType("CPU");
 704                         memset(extPerfEntry.exec_type, 0, sizeof(extPerfEntry.exec_type));
 705                         cpuExecType.copy(extPerfEntry.exec_type, cpuExecType.length());  // Override execType as CPU
 706                     } else {
 707                         std::string impl = pi.kernel_id;
 708                         impl.copy(extPerfEntry.exec_type, impl.length());
 709                     }
 710
 711                     getUpperCaseName(pi.type_id).copy(extPerfEntry.layer_type, pi.type_id.length());
 712                     extPerfEntry.execution_index = i++;
 713                     extPerfEntry.status = InferenceEngineProfileInfo::LayerStatus::EXECUTED;
 714                     extPerfEntry.cpu_uSec = cpuTime;
 715                     extPerfEntry.realTime_uSec = deviceTime;
 716
 717                     if (pi.type_id == "input_layout") {
 718                         const std::string input_string = "Input";
 719                         const std::string undef_string = "undef";
 720                         input_string.copy(extPerfEntry.layer_type, 256);
 721                         undef_string.copy(extPerfEntry.exec_type, 256);
 722                     }
 723                 }
 724             }
 725         }
 726     }
 727
 728     // Step 3. Checking primitives which has been deleted from execution order but added by clDNNPlugin
 729     for (auto& primId : profilingIDs)
 730         if (std::find(allIds.begin(), allIds.end(), primId) == allIds.end()) {
 731             getFromProfiling(primId);
 732         }
 733 }
 734
 735 std::shared_ptr<cldnn::network> CLDNNGraph::GetNetwork(size_t idx) const {
 736     if (idx >= GetNetworksCount())
 737         THROW_IE_EXCEPTION << "Unable to find network with id=" << idx << ". Stored networks count: " << GetNetworksCount();
 738
 739     return m_networks[idx];
 740 }
 741
 742
 743 std::string CLDNNGraph::MapOutputName(std::string outName) const {
 744     auto networkOutputsIDs = GetNetwork()->get_output_ids();
 745     auto allPrimitiveIds = GetNetwork()->get_all_primitives();
 746
 747     // Find correct output ID. Start with name stored in IR.
 748     std::string outputID = primitiveIDs.at(outName);
 749     while (std::find(networkOutputsIDs.begin(), networkOutputsIDs.end(), outputID) == networkOutputsIDs.end()) {
 750         // If current ID isn't found in cldnn network outputs, get previous primitive id and try again.
 751         auto prim = allPrimitiveIds.find(outputID);
 752         if (prim == allPrimitiveIds.end()) {
 753             THROW_IE_EXCEPTION << "Unknown primitive id " << outputID;
 754         }
 755
 756         if (prevPrimitiveIDs.at(outputID).size() != 1 || prim->second != "_optimized_") {
 757             THROW_IE_EXCEPTION << "Unable to find parent for output primitive " << outputID;
 758         }
 759         outputID = prevPrimitiveIDs.at(outputID)[0];
 760     }
 761
 762     return outputID;
 763 }
 764
 765 InferenceEngine::SizeVector CLDNNGraph::GetOutputSize(std::string outName) const {
 766     auto res_output = outputDims.find(outName);
 767
 768     InferenceEngine::SizeVector sz;
 769     if (res_output != outputDims.end())
 770         sz = res_output->second;
 771     else
 772         sz = outputDims.at(primitiveIDs.at(outName));
 773
 774     return sz;
 775 }
 776
 777 };  // namespace CLDNNPlugin