inference-engine/samples/benchmark_app/main.cpp

   1 // Copyright (C) 2019 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
   5 #include <algorithm>
   6 #include <chrono>
   7 #include <memory>
   8 #include <map>
   9 #include <string>
  10 #include <vector>
  11 #include <utility>
  12
  13 #include <inference_engine.hpp>
  14 #include <format_reader_ptr.h>
  15
  16 #include <vpu/vpu_plugin_config.hpp>
  17 #include <samples/common.hpp>
  18 #include <samples/slog.hpp>
  19 #include <samples/args_helper.hpp>
  20
  21 #include "benchmark_app.hpp"
  22 #include "infer_request_wrap.hpp"
  23 #include "progress_bar.hpp"
  24 #include "statistics_report.hpp"
  25
  26 using namespace InferenceEngine;
  27
  28 long long getDurationInNanoseconds(const std::string& device);
  29
  30 void fillBlobWithImage(
  31     Blob::Ptr& inputBlob,
  32     const std::vector<std::string>& filePaths,
  33     const size_t& batchSize,
  34     const InferenceEngine::InputInfo& info);
  35
  36 static const size_t progressBarDefaultTotalCount = 1000;
  37
  38 bool ParseAndCheckCommandLine(int argc, char *argv[]) {
  39     // ---------------------------Parsing and validation of input args--------------------------------------
  40     slog::info << "Parsing input parameters" << slog::endl;
  41     gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true);
  42     if (FLAGS_h) {
  43         showUsage();
  44         return false;
  45     }
  46
  47     if (FLAGS_m.empty()) {
  48         throw std::logic_error("Model required is not set. Please use -h.");
  49     }
  50
  51     if (FLAGS_api.empty()) {
  52         throw std::logic_error("API not selected. Please use -h.");
  53     }
  54
  55     if (FLAGS_api != "async" && FLAGS_api != "sync") {
  56         throw std::logic_error("Incorrect API. Please use -h.");
  57     }
  58
  59     if (FLAGS_i.empty()) {
  60         throw std::logic_error("Input is not set. Please use -h.");
  61     }
  62
  63     if (FLAGS_niter < 0) {
  64         throw std::logic_error("Number of iterations should be positive (invalid -niter option value)");
  65     }
  66
  67     if (FLAGS_nireq < 0) {
  68         throw std::logic_error("Number of inference requests should be positive (invalid -nireq option value)");
  69     }
  70
  71     if (FLAGS_b < 0) {
  72         throw std::logic_error("Batch size should be positive (invalid -b option value)");
  73     }
  74
  75     if (!FLAGS_report_type.empty() &&
  76          FLAGS_report_type != noCntReport && FLAGS_report_type != medianCntReport && FLAGS_report_type != detailedCntReport) {
  77         std::string err = "only " + std::string(noCntReport) + "/" + std::string(medianCntReport) + "/" + std::string(detailedCntReport) +
  78                 " report types are supported (invalid -report_type option value)";
  79         throw std::logic_error(err);
  80     }
  81
  82     return true;
  83 }
  84
  85 /**
  86 * @brief The entry point the benchmark application
  87 */
  88 int main(int argc, char *argv[]) {
  89     try {
  90         slog::info << "InferenceEngine: " << InferenceEngine::GetInferenceEngineVersion() << slog::endl;
  91
  92         // ------------------------------ Parsing and validation of input args ---------------------------------
  93         std::cout << std::endl << "[Step 1/8] Parsing and validation of input args" << std::endl;
  94         ProgressBar progressBar(1, FLAGS_stream_output);
  95
  96         if (!ParseAndCheckCommandLine(argc, argv)) {
  97             return 0;
  98         }
  99
 100         /** This vector stores paths to the processed images **/
 101         std::vector<std::string> inputImages;
 102         parseInputFilesArguments(inputImages);
 103         if (inputImages.size() == 0ULL) {
 104             throw std::logic_error("no images found");
 105         }
 106         progressBar.addProgress(1);
 107         progressBar.finish();
 108
 109         // --------------------------- 1. Load Plugin for inference engine -------------------------------------
 110
 111         std::cout << "[Step 2/8] Loading plugin" << std::endl;
 112         progressBar.newBar(1);
 113
 114         InferencePlugin plugin = PluginDispatcher({ FLAGS_pp }).getPluginByDevice(FLAGS_d);
 115
 116         if (!FLAGS_l.empty()) {
 117             // CPU (MKLDNN) extensions is loaded as a shared library and passed as a pointer to base extension
 118             const std::shared_ptr<IExtension> extension_ptr = InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(FLAGS_l);
 119             plugin.AddExtension(extension_ptr);
 120             slog::info << "CPU (MKLDNN) extensions is loaded " << FLAGS_l << slog::endl;
 121         } else if (!FLAGS_c.empty()) {
 122             // Load clDNN Extensions
 123             plugin.SetConfig({ {CONFIG_KEY(CONFIG_FILE), FLAGS_c} });
 124             slog::info << "GPU extensions is loaded " << FLAGS_c << slog::endl;
 125         }
 126
 127         InferenceEngine::ResponseDesc resp;
 128         if (FLAGS_d == "MYRIAD") {
 129             plugin.SetConfig({ {CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_INFO)}, {VPU_CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_INFO)} });
 130         }
 131
 132         const Version *pluginVersion = plugin.GetVersion();
 133         slog::info << pluginVersion << slog::endl;
 134
 135         progressBar.addProgress(1);
 136         progressBar.finish();
 137
 138         // --------------------------- 2. Read IR Generated by ModelOptimizer (.xml and .bin files) ------------
 139
 140         std::cout << "[Step 3/8] Read IR network" << std::endl;
 141         progressBar.newBar(1);
 142
 143         slog::info << "Loading network files" << slog::endl;
 144
 145         InferenceEngine::CNNNetReader netBuilder;
 146         netBuilder.ReadNetwork(FLAGS_m);
 147         const std::string binFileName = fileNameNoExt(FLAGS_m) + ".bin";
 148         netBuilder.ReadWeights(binFileName);
 149
 150         InferenceEngine::CNNNetwork cnnNetwork = netBuilder.getNetwork();
 151         const InferenceEngine::InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
 152         if (inputInfo.empty()) {
 153             throw std::logic_error("no inputs info is provided");
 154         }
 155
 156         if (inputInfo.size() != 1) {
 157             throw std::logic_error("only networks with one input are supported");
 158         }
 159
 160         // --------------------------- 3. Resize network to match image sizes and given batch----------------------
 161
 162         if (FLAGS_b != 0) {
 163             // We support models having only one input layers
 164             ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
 165             const ICNNNetwork::InputShapes::iterator& it = shapes.begin();
 166             if (it->second.size() != 4) {
 167                 throw std::logic_error("Unsupported model for batch size changing in automatic mode");
 168             }
 169             it->second[0] = FLAGS_b;
 170             slog::info << "Resizing network to batch = " << FLAGS_b << slog::endl;
 171             cnnNetwork.reshape(shapes);
 172         }
 173
 174         const size_t batchSize = cnnNetwork.getBatchSize();
 175         const Precision precision = inputInfo.begin()->second->getPrecision();
 176         slog::info << (FLAGS_b != 0 ? "Network batch size was changed to: " : "Network batch size: ") << batchSize <<
 177             ", precision: " << precision << slog::endl;
 178
 179         progressBar.addProgress(1);
 180         progressBar.finish();
 181
 182         // --------------------------- 4. Configure input & output ---------------------------------------------
 183
 184         std::cout << "[Step 4/8] Configure input & output of the model" << std::endl;
 185         progressBar.newBar(1);
 186
 187         const InferenceEngine::Precision inputPrecision = InferenceEngine::Precision::U8;
 188         for (auto& item : inputInfo) {
 189             /** Set the precision of input data provided by the user, should be called before load of the network to the plugin **/
 190             item.second->setInputPrecision(inputPrecision);
 191         }
 192
 193         const size_t imagesCount = inputImages.size();
 194         if (batchSize > imagesCount) {
 195             slog::warn << "Network batch size " << batchSize << " is greater than images count " << imagesCount <<
 196                 ", some input files will be duplicated" << slog::endl;
 197         } else if (batchSize < imagesCount) {
 198             slog::warn << "Network batch size " << batchSize << " is less then images count " << imagesCount <<
 199                 ", some input files will be ignored" << slog::endl;
 200         }
 201
 202         // ------------------------------ Prepare output blobs -------------------------------------------------
 203         slog::info << "Preparing output blobs" << slog::endl;
 204         InferenceEngine::OutputsDataMap outputInfo(cnnNetwork.getOutputsInfo());
 205         InferenceEngine::BlobMap outputBlobs;
 206         for (auto& item : outputInfo) {
 207             const InferenceEngine::DataPtr outData = item.second;
 208             if (!outData) {
 209                 throw std::logic_error("output data pointer is not valid");
 210             }
 211             InferenceEngine::SizeVector outputDims = outData->dims;
 212             const InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32;
 213
 214             /** Set the precision of output data provided by the user, should be called before load of the network to the plugin **/
 215             outData->setPrecision(outputPrecision);
 216             InferenceEngine::TBlob<float>::Ptr output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
 217             output->allocate();
 218             outputBlobs[item.first] = output;
 219         }
 220
 221         progressBar.addProgress(1);
 222         progressBar.finish();
 223
 224         // --------------------------- 5. Loading model to the plugin ------------------------------------------
 225
 226         std::cout << "[Step 5/8] Loading model to the plugin " << std::endl;
 227         progressBar.newBar(1);
 228
 229         std::map<std::string, std::string> networkConfig;
 230         if (FLAGS_d.find("CPU") != std::string::npos) {  // CPU supports few special performance-oriented keys
 231             // limit threading for CPU portion of inference
 232             if (FLAGS_nthreads != 0)
 233                 networkConfig[PluginConfigParams::KEY_CPU_THREADS_NUM] = std::to_string(FLAGS_nthreads);
 234             // pin threads for CPU portion of inference
 235             networkConfig[PluginConfigParams::KEY_CPU_BIND_THREAD] = FLAGS_pin;
 236             // for pure CPU execution, more throughput-oriented execution via streams
 237             if (FLAGS_api == "async" && FLAGS_d == "CPU")
 238                 networkConfig[PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS] = std::to_string(FLAGS_nireq);
 239         }
 240
 241         if (FLAGS_report_type == detailedCntReport || FLAGS_report_type == medianCntReport) {
 242             networkConfig[PluginConfigParams::KEY_PERF_COUNT] = PluginConfigParams::YES;
 243         }
 244
 245         InferenceEngine::ExecutableNetwork exeNetwork = plugin.LoadNetwork(cnnNetwork, networkConfig);
 246
 247         progressBar.addProgress(1);
 248         progressBar.finish();
 249
 250         // --------------------------- 6. Create infer requests and fill input blobs ---------------------------
 251
 252         std::cout << "[Step 6/8] Create infer requests and fill input blobs with images" << std::endl;
 253         progressBar.newBar(1);
 254
 255         std::vector<InferReqWrap::Ptr> inferRequests;
 256         auto numOfReq = (FLAGS_api == "async") ? FLAGS_nireq : 1;
 257         inferRequests.reserve(numOfReq);
 258
 259         for (size_t i = 0; i < numOfReq; i++) {
 260             inferRequests.push_back(std::make_shared<InferReqWrap>(exeNetwork));
 261             slog::info << "Infer Request " << i << " created" << slog::endl;
 262
 263             for (const InputsDataMap::value_type& item : inputInfo) {
 264                 Blob::Ptr inputBlob = inferRequests[i]->getBlob(item.first);
 265                 fillBlobWithImage(inputBlob, inputImages, batchSize, *item.second);
 266             }
 267         }
 268
 269         progressBar.addProgress(1);
 270         progressBar.finish();
 271
 272         // --------------------------- 7. Performance measurements stuff ------------------------------------------
 273
 274         long long durationInNanoseconds;
 275         if (FLAGS_niter != 0) {
 276             durationInNanoseconds = 0LL;
 277         } else {
 278             durationInNanoseconds = getDurationInNanoseconds(FLAGS_d);
 279         }
 280
 281         std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> emptyStat = {};
 282         StatisticsReport::Config config = {
 283             FLAGS_d,
 284             FLAGS_api,
 285             batchSize,
 286             FLAGS_nireq,
 287             FLAGS_niter,
 288             FLAGS_nthreads,
 289             FLAGS_pin,
 290             FLAGS_report_type,
 291             FLAGS_report_folder
 292         };
 293         StatisticsReport statistics(config);
 294         double fps;
 295         double totalDuration;
 296
 297         size_t progressCnt = 0;
 298         size_t progressBarTotalCount;
 299         size_t iteration = 0;
 300
 301         if (FLAGS_api == "sync") {
 302             InferReqWrap::Ptr inferRequest = inferRequests[0];
 303
 304             std::cout << "[Step 7/8] ";
 305             if (FLAGS_niter != 0) {
 306                 std::cout << "Start inference synchronously (" << FLAGS_niter << " sync inference executions)" << std::endl;
 307                 progressBarTotalCount = FLAGS_niter;
 308             } else {
 309                 std::cout << "Start inference synchronously (" << durationInNanoseconds * 0.000001 << " ms duration)" << std::endl;
 310                 progressBarTotalCount = progressBarDefaultTotalCount;
 311             }
 312
 313             // warming up - out of scope
 314             inferRequest->infer();
 315
 316             const auto startTime = Time::now();
 317             auto execTime = std::chrono::duration_cast<ns>(Time::now() - startTime).count();
 318
 319             /** Start inference & calculate performance **/
 320             progressBar.newBar(progressBarTotalCount);
 321             while ((iteration < FLAGS_niter) ||
 322                    ((FLAGS_niter == 0) && (execTime < durationInNanoseconds))) {
 323                 inferRequest->infer();
 324                 statistics.add((FLAGS_report_type == detailedCntReport || FLAGS_report_type == medianCntReport) ?
 325                                inferRequest->getPerformanceCounts() : emptyStat,
 326                                inferRequest->getExecTime());
 327
 328                 iteration++;
 329
 330                 if (FLAGS_niter > 0) {
 331                     progressBar.addProgress(1);
 332                 } else {
 333                     execTime = std::chrono::duration_cast<ns>(Time::now() - startTime).count();
 334                     // calculate how many progress intervals are covered by current iteration.
 335                     // depends on the current iteration time and time of each progress interval.
 336                     // Previously covered progress intervals must be skipped.
 337                     auto progressIntervalTime = durationInNanoseconds / progressBarTotalCount;
 338                     size_t newProgress = execTime / progressIntervalTime - progressCnt;
 339                     progressBar.addProgress(newProgress);
 340                     progressCnt += newProgress;
 341                 }
 342             }
 343             fps = batchSize * 1000.0 / statistics.getMedianLatency();
 344             totalDuration = std::chrono::duration_cast<ns>(Time::now() - startTime).count() * 0.000001;
 345             progressBar.finish();
 346         } else {
 347             std::cout << "[Step 7/8] ";
 348             if (FLAGS_niter != 0) {
 349                 std::cout << "Start inference asynchronously (" << FLAGS_niter <<
 350                     " async inference executions, " << FLAGS_nireq <<
 351                     " inference requests in parallel)" << std::endl;
 352                 progressBarTotalCount = FLAGS_niter + FLAGS_nireq - 1;
 353             } else {
 354                 std::cout << std::endl << "Start inference asynchronously (" << durationInNanoseconds * 0.000001 <<
 355                     " ms duration, " << FLAGS_nireq <<
 356                     " inference requests in parallel)" << std::endl;
 357                 progressBarTotalCount = 1000;
 358             }
 359
 360
 361             size_t currentInference = 0ULL;
 362             bool requiredInferenceRequestsWereExecuted = false;
 363             long long previousInference = 1LL - FLAGS_nireq;
 364
 365             // warming up - out of scope
 366             inferRequests[0]->startAsync();
 367             inferRequests[0]->wait();
 368
 369             const auto startTime = Time::now();
 370             auto execTime = std::chrono::duration_cast<ns>(Time::now() - startTime).count();
 371
 372             /** Start inference & calculate performance **/
 373             /** to use FLAGS_niter + FLAGS_nireq - 1 to guarantee that last infer requests are executed in the same conditions **/
 374             progressBar.newBar(progressBarTotalCount);
 375             while ((!requiredInferenceRequestsWereExecuted) ||
 376                 (iteration < FLAGS_niter + FLAGS_nireq - 1) ||
 377                 ((FLAGS_niter == 0LL) && (execTime < durationInNanoseconds))) {
 378                 // start new inference
 379                 inferRequests[currentInference]->startAsync();
 380
 381                 // wait the latest inference execution if exists
 382                 if (previousInference >= 0) {
 383                     inferRequests[previousInference]->wait();
 384                     // update statistics with PM counters only in case of detailed or median reports
 385                     statistics.add((FLAGS_report_type == detailedCntReport || FLAGS_report_type == medianCntReport) ?
 386                                    inferRequests[previousInference]->getPerformanceCounts() : emptyStat,
 387                                    inferRequests[previousInference]->getExecTime());
 388                 }
 389
 390                 currentInference++;
 391                 if (currentInference >= FLAGS_nireq) {
 392                     currentInference = 0;
 393                     requiredInferenceRequestsWereExecuted = true;
 394                 }
 395
 396                 previousInference++;
 397                 if (previousInference >= FLAGS_nireq) {
 398                     previousInference = 0;
 399                 }
 400
 401                 iteration++;
 402
 403                 if (FLAGS_niter > 0) {
 404                     progressBar.addProgress(1);
 405                 } else {
 406                     execTime = std::chrono::duration_cast<ns>(Time::now() - startTime).count();
 407                     // calculate how many progress intervals are covered by current iteration.
 408                     // depends on the current iteration time and time of each progress interval.
 409                     // Previously covered progress intervals must be skipped.
 410                     auto progressIntervalTime = durationInNanoseconds / progressBarTotalCount;
 411                     size_t newProgress = execTime / progressIntervalTime - progressCnt;
 412                     progressBar.addProgress(newProgress);
 413                     progressCnt += newProgress;
 414                 }
 415             }
 416
 417             // wait the latest inference executions
 418             for (size_t notCompletedIndex = 0ULL; notCompletedIndex < (FLAGS_nireq - 1); ++notCompletedIndex) {
 419                 if (previousInference >= 0) {
 420                     inferRequests[previousInference]->wait();
 421                     // update statistics with PM counters only in case of detailed or median reports
 422                     statistics.add((FLAGS_report_type == detailedCntReport || FLAGS_report_type == medianCntReport) ?
 423                                    inferRequests[previousInference]->getPerformanceCounts() : emptyStat,
 424                                    inferRequests[previousInference]->getExecTime());
 425                 }
 426
 427                 previousInference++;
 428                 if (previousInference >= FLAGS_nireq) {
 429                     previousInference = 0LL;
 430                 }
 431             }
 432             totalDuration = std::chrono::duration_cast<ns>(Time::now() - startTime).count() * 0.000001;
 433             fps = batchSize * 1000.0 * iteration / totalDuration;
 434             progressBar.finish();
 435         }
 436
 437         std::cout << "[Step 8/8] Dump statistics report" << std::endl;
 438         progressBar.newBar(1);
 439         statistics.dump(fps, iteration, totalDuration);
 440
 441         if (!FLAGS_exec_graph_path.empty()) {
 442             CNNNetwork execGraphInfo = exeNetwork.GetExecGraphInfo();
 443             execGraphInfo.serialize(FLAGS_exec_graph_path);
 444             slog::info << "executable graph is stored to " << FLAGS_exec_graph_path << slog::endl;
 445         }
 446         progressBar.addProgress(1);
 447         progressBar.finish();
 448
 449         std::cout << "Latency: " << statistics.getMedianLatency() << " ms" << std::endl;
 450         std::cout << "Throughput: " << fps << " FPS" << std::endl;
 451     } catch (const std::exception& ex) {
 452         slog::err << ex.what() << slog::endl;
 453         return 3;
 454     }
 455
 456     return 0;
 457 }
 458
 459 long long getDurationInNanoseconds(const std::string& device) {
 460     static const std::vector<std::pair<std::string, long long>> deviceDurationsInSeconds{
 461             { "CPU", 60LL },
 462             { "GPU", 60LL },
 463             { "VPU", 60LL },
 464             { "MYRIAD", 60LL },
 465             { "HDDL", 60LL },
 466             { "FPGA", 120LL },
 467             { "UNKNOWN", 120LL }
 468     };
 469
 470     auto duration = 0LL;
 471     for (const auto& deviceDurationInSeconds : deviceDurationsInSeconds) {
 472         if (device.find(deviceDurationInSeconds.first) != std::string::npos) {
 473             duration = std::max(duration, deviceDurationInSeconds.second);
 474         }
 475     }
 476
 477     if (duration == 0LL) {
 478         const auto unknownDeviceIt = find_if(
 479             deviceDurationsInSeconds.begin(),
 480             deviceDurationsInSeconds.end(),
 481             [](std::pair<std::string, long long> deviceDuration) { return deviceDuration.first == "UNKNOWN"; });
 482
 483         if (unknownDeviceIt == deviceDurationsInSeconds.end()) {
 484             throw std::logic_error("UNKNOWN device was not found in device duration list");
 485         }
 486         duration = unknownDeviceIt->second;
 487         slog::warn << "Default duration " << duration << " seconds for unknown device '" << device << "' is used" << slog::endl;
 488     }
 489
 490     return duration * 1000000000LL;
 491 }
 492
 493 void fillBlobWithImage(
 494     Blob::Ptr& inputBlob,
 495     const std::vector<std::string>& filePaths,
 496     const size_t& batchSize,
 497     const InferenceEngine::InputInfo& info) {
 498
 499     auto inputBlobData = inputBlob->buffer().as<uint8_t*>();
 500     const SizeVector& inputBlobDims = inputBlob->dims();
 501
 502     slog::info << "Network Input dimensions (" << info.getTensorDesc().getLayout() << "): ";
 503     for (const auto& i : info.getTensorDesc().getDims()) {
 504         slog::info << i << " ";
 505     }
 506     slog::info << slog::endl;
 507
 508     /** Collect images data ptrs **/
 509     std::vector<std::shared_ptr<uint8_t>> vreader;
 510     vreader.reserve(batchSize);
 511
 512     for (size_t i = 0ULL, inputIndex = 0ULL; i < batchSize; i++, inputIndex++) {
 513         if (inputIndex >= filePaths.size()) {
 514             inputIndex = 0ULL;
 515         }
 516
 517         slog::info << "Prepare image " << filePaths[inputIndex] << slog::endl;
 518         FormatReader::ReaderPtr reader(filePaths[inputIndex].c_str());
 519         if (reader.get() == nullptr) {
 520             slog::warn << "Image " << filePaths[inputIndex] << " cannot be read!" << slog::endl << slog::endl;
 521             continue;
 522         }
 523
 524         /** Getting image data **/
 525         std::shared_ptr<uint8_t> imageData(reader->getData(info.getDims()[0], info.getDims()[1]));
 526         if (imageData) {
 527             vreader.push_back(imageData);
 528         }
 529     }
 530
 531     /** Fill input tensor with images. First b channel, then g and r channels **/
 532     const size_t numChannels = inputBlobDims[2];
 533     const size_t imageSize = inputBlobDims[1] * inputBlobDims[0];
 534     /** Iterate over all input images **/
 535     for (size_t imageId = 0; imageId < vreader.size(); ++imageId) {
 536         /** Iterate over all pixel in image (b,g,r) **/
 537         for (size_t pid = 0; pid < imageSize; pid++) {
 538             /** Iterate over all channels **/
 539             for (size_t ch = 0; ch < numChannels; ++ch) {
 540                 /**          [images stride + channels stride + pixel id ] all in bytes            **/
 541                 inputBlobData[imageId * imageSize * numChannels + ch * imageSize + pid] = vreader.at(imageId).get()[pid*numChannels + ch];
 542             }
 543         }
 544     }
 545 }