// Copyright (C) 2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <algorithm>
#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include <inference_engine.hpp>
#include <format_reader_ptr.h>

#include <vpu/vpu_plugin_config.hpp>
#include <samples/common.hpp>
#include <samples/slog.hpp>
#include <samples/args_helper.hpp>

#include "benchmark_app.hpp"
#include "infer_request_wrap.hpp"
#include "progress_bar.hpp"
#include "statistics_report.hpp"

using namespace InferenceEngine;

long long getDurationInNanoseconds(const std::string& device);

void fillBlobWithImage(
    Blob::Ptr& inputBlob,
    const std::vector<std::string>& filePaths,
    const size_t& batchSize,
    const InferenceEngine::InputInfo& info);

static const size_t progressBarDefaultTotalCount = 1000;

bool ParseAndCheckCommandLine(int argc, char *argv[]) {
    // ---------------------------Parsing and validation of input args--------------------------------------
    slog::info << "Parsing input parameters" << slog::endl;
    gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true);

    if (FLAGS_m.empty()) {
        throw std::logic_error("Model is required but not set. Please use -h.");
    }

    if (FLAGS_api.empty()) {
        throw std::logic_error("API not selected. Please use -h.");
    }

    if (FLAGS_api != "async" && FLAGS_api != "sync") {
        throw std::logic_error("Incorrect API. Please use -h.");
    }

    if (FLAGS_i.empty()) {
        throw std::logic_error("Input is not set. Please use -h.");
    }

    if (FLAGS_niter < 0) {
        throw std::logic_error("Number of iterations should be positive (invalid -niter option value)");
    }

    if (FLAGS_nireq < 0) {
        throw std::logic_error("Number of inference requests should be positive (invalid -nireq option value)");
    }

    // a negative batch size is invalid; 0 means keep the network's own batch size
    if (FLAGS_b < 0) {
        throw std::logic_error("Batch size should be positive (invalid -b option value)");
    }

    if (!FLAGS_report_type.empty() &&
        FLAGS_report_type != noCntReport && FLAGS_report_type != medianCntReport && FLAGS_report_type != detailedCntReport) {
        std::string err = "only " + std::string(noCntReport) + "/" + std::string(medianCntReport) + "/" + std::string(detailedCntReport) +
            " report types are supported (invalid -report_type option value)";
        throw std::logic_error(err);
    }

    return true;
}

/**
 * @brief The entry point of the benchmark application
 */
int main(int argc, char *argv[]) {
    try {
        slog::info << "InferenceEngine: " << InferenceEngine::GetInferenceEngineVersion() << slog::endl;

        // ------------------------------ Parsing and validation of input args ---------------------------------
        std::cout << std::endl << "[Step 1/8] Parsing and validation of input args" << std::endl;
        ProgressBar progressBar(1, FLAGS_stream_output);

        if (!ParseAndCheckCommandLine(argc, argv)) {
            return 0;
        }

        /** This vector stores paths to the processed images **/
        std::vector<std::string> inputImages;
        parseInputFilesArguments(inputImages);
        if (inputImages.size() == 0ULL) {
            throw std::logic_error("no images found");
        }
        progressBar.addProgress(1);
        progressBar.finish();

        // --------------------------- 1. Load Plugin for inference engine -------------------------------------

        std::cout << "[Step 2/8] Loading plugin" << std::endl;
        progressBar.newBar(1);

        InferencePlugin plugin = PluginDispatcher({ FLAGS_pp }).getPluginByDevice(FLAGS_d);

        if (!FLAGS_l.empty()) {
            // The CPU (MKLDNN) extension is loaded as a shared library and passed as a pointer to the base extension
            const std::shared_ptr<IExtension> extension_ptr = InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(FLAGS_l);
            plugin.AddExtension(extension_ptr);
            slog::info << "CPU (MKLDNN) extension is loaded " << FLAGS_l << slog::endl;
        } else if (!FLAGS_c.empty()) {
            // Load clDNN (GPU) extensions from the configuration file
            plugin.SetConfig({ {CONFIG_KEY(CONFIG_FILE), FLAGS_c} });
            slog::info << "GPU extensions are loaded " << FLAGS_c << slog::endl;
        }

        InferenceEngine::ResponseDesc resp;
        if (FLAGS_d == "MYRIAD") {
            plugin.SetConfig({ {CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_INFO)}, {VPU_CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_INFO)} });
        }

        const Version *pluginVersion = plugin.GetVersion();
        slog::info << pluginVersion << slog::endl;

        progressBar.addProgress(1);
        progressBar.finish();

        // --------------------------- 2. Read IR Generated by ModelOptimizer (.xml and .bin files) ------------

        std::cout << "[Step 3/8] Read IR network" << std::endl;
        progressBar.newBar(1);

        slog::info << "Loading network files" << slog::endl;

        InferenceEngine::CNNNetReader netBuilder;
        netBuilder.ReadNetwork(FLAGS_m);
        const std::string binFileName = fileNameNoExt(FLAGS_m) + ".bin";
        netBuilder.ReadWeights(binFileName);

        InferenceEngine::CNNNetwork cnnNetwork = netBuilder.getNetwork();
        const InferenceEngine::InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
        if (inputInfo.empty()) {
            throw std::logic_error("no input info is provided");
        }

        if (inputInfo.size() != 1) {
            throw std::logic_error("only networks with one input are supported");
        }

        // --------------------------- 3. Resize network to match image sizes and given batch ------------------

        if (FLAGS_b != 0) {
            // We support models having only one input layer
            ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
            const ICNNNetwork::InputShapes::iterator& it = shapes.begin();
            if (it->second.size() != 4) {
                throw std::logic_error("Unsupported model for batch size changing in automatic mode");
            }
            it->second[0] = FLAGS_b;
            slog::info << "Resizing network to batch = " << FLAGS_b << slog::endl;
            cnnNetwork.reshape(shapes);
        }

        const size_t batchSize = cnnNetwork.getBatchSize();
        const Precision precision = inputInfo.begin()->second->getPrecision();
        slog::info << (FLAGS_b != 0 ? "Network batch size was changed to: " : "Network batch size: ") << batchSize <<
            ", precision: " << precision << slog::endl;

        progressBar.addProgress(1);
        progressBar.finish();

        // --------------------------- 4. Configure input & output ---------------------------------------------

        std::cout << "[Step 4/8] Configure input & output of the model" << std::endl;
        progressBar.newBar(1);
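
        // All inputs are configured to accept U8 image data, as produced by the FormatReader-based
        // fillBlobWithImage() helper below.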
        const InferenceEngine::Precision inputPrecision = InferenceEngine::Precision::U8;
        for (auto& item : inputInfo) {
            /** Set the precision of input data provided by the user; this should be called before loading the network to the plugin **/
            item.second->setInputPrecision(inputPrecision);
        }

        const size_t imagesCount = inputImages.size();
        if (batchSize > imagesCount) {
            slog::warn << "Network batch size " << batchSize << " is greater than images count " << imagesCount <<
                ", some input files will be duplicated" << slog::endl;
        } else if (batchSize < imagesCount) {
            slog::warn << "Network batch size " << batchSize << " is less than images count " << imagesCount <<
                ", some input files will be ignored" << slog::endl;
        }

        // ------------------------------ Prepare output blobs -------------------------------------------------
        slog::info << "Preparing output blobs" << slog::endl;
        InferenceEngine::OutputsDataMap outputInfo(cnnNetwork.getOutputsInfo());
        InferenceEngine::BlobMap outputBlobs;
        for (auto& item : outputInfo) {
            const InferenceEngine::DataPtr outData = item.second;
            if (!outData) {
                throw std::logic_error("output data pointer is not valid");
            }
            InferenceEngine::SizeVector outputDims = outData->dims;
            const InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32;

            /** Set the precision of output data; this should be called before loading the network to the plugin **/
            outData->setPrecision(outputPrecision);
            InferenceEngine::TBlob<float>::Ptr output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
            output->allocate();
            outputBlobs[item.first] = output;
        }

        progressBar.addProgress(1);
        progressBar.finish();

        // --------------------------- 5. Loading model to the plugin ------------------------------------------

        std::cout << "[Step 5/8] Loading model to the plugin " << std::endl;
        progressBar.newBar(1);
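
        // Collect plugin configuration keys (CPU threading, thread pinning, throughput streams,
        // and performance counters) before loading the network.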
        std::map<std::string, std::string> networkConfig;
        if (FLAGS_d.find("CPU") != std::string::npos) {  // CPU supports a few special performance-oriented keys
            // limit threading for the CPU portion of inference
            if (FLAGS_nthreads != 0)
                networkConfig[PluginConfigParams::KEY_CPU_THREADS_NUM] = std::to_string(FLAGS_nthreads);
            // pin threads for the CPU portion of inference
            networkConfig[PluginConfigParams::KEY_CPU_BIND_THREAD] = FLAGS_pin;
            // for pure CPU execution, a more throughput-oriented execution via streams
            if (FLAGS_api == "async" && FLAGS_d == "CPU")
                networkConfig[PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS] = std::to_string(FLAGS_nireq);
        }

        if (FLAGS_report_type == detailedCntReport || FLAGS_report_type == medianCntReport) {
            networkConfig[PluginConfigParams::KEY_PERF_COUNT] = PluginConfigParams::YES;
        }

        InferenceEngine::ExecutableNetwork exeNetwork = plugin.LoadNetwork(cnnNetwork, networkConfig);

        progressBar.addProgress(1);
        progressBar.finish();

        // --------------------------- 6. Create infer requests and fill input blobs ---------------------------

        std::cout << "[Step 6/8] Create infer requests and fill input blobs with images" << std::endl;
        progressBar.newBar(1);

        std::vector<InferReqWrap::Ptr> inferRequests;
        auto numOfReq = (FLAGS_api == "async") ? FLAGS_nireq : 1;
        inferRequests.reserve(numOfReq);
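
        // Create one infer request for sync mode or FLAGS_nireq requests for async mode,
        // and fill each request's input blob with image data up front.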
        for (size_t i = 0; i < numOfReq; i++) {
            inferRequests.push_back(std::make_shared<InferReqWrap>(exeNetwork));
            slog::info << "Infer Request " << i << " created" << slog::endl;

            for (const InputsDataMap::value_type& item : inputInfo) {
                Blob::Ptr inputBlob = inferRequests[i]->getBlob(item.first);
                fillBlobWithImage(inputBlob, inputImages, batchSize, *item.second);
            }
        }

        progressBar.addProgress(1);
        progressBar.finish();

        // --------------------------- 7. Performance measurements stuff ---------------------------------------

        long long durationInNanoseconds;
        if (FLAGS_niter != 0) {
            durationInNanoseconds = 0LL;
        } else {
            durationInNanoseconds = getDurationInNanoseconds(FLAGS_d);
        }

        std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> emptyStat = {};
        StatisticsReport::Config config = {

        StatisticsReport statistics(config);

        double fps = 0.0;
        double totalDuration = 0.0;

        size_t progressCnt = 0;
        size_t progressBarTotalCount;
        size_t iteration = 0;
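
        // Run either a fixed number of iterations (-niter != 0) or keep inferring for a
        // device-specific default duration. The sync path below uses a single infer request;
        // the async path keeps FLAGS_nireq requests in flight.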
        if (FLAGS_api == "sync") {
            InferReqWrap::Ptr inferRequest = inferRequests[0];

            std::cout << "[Step 7/8] ";
            if (FLAGS_niter != 0) {
                std::cout << "Start inference synchronously (" << FLAGS_niter << " sync inference executions)" << std::endl;
                progressBarTotalCount = FLAGS_niter;
            } else {
                std::cout << "Start inference synchronously (" << durationInNanoseconds * 0.000001 << " ms duration)" << std::endl;
                progressBarTotalCount = progressBarDefaultTotalCount;
            }

            // warm-up run, excluded from the measured scope
            inferRequest->infer();

            const auto startTime = Time::now();
            auto execTime = std::chrono::duration_cast<ns>(Time::now() - startTime).count();

            /** Start inference & calculate performance **/
            progressBar.newBar(progressBarTotalCount);
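            // Each loop iteration runs one synchronous inference, records its latency
            // (plus per-layer counters when a detailed/median report was requested),
            // and advances the progress bar either per iteration or per elapsed time.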
            while ((iteration < FLAGS_niter) ||
                   ((FLAGS_niter == 0) && (execTime < durationInNanoseconds))) {
                inferRequest->infer();
                statistics.add((FLAGS_report_type == detailedCntReport || FLAGS_report_type == medianCntReport) ?
                    inferRequest->getPerformanceCounts() : emptyStat,
                    inferRequest->getExecTime());

                iteration++;

                if (FLAGS_niter > 0) {
                    progressBar.addProgress(1);
                } else {
                    execTime = std::chrono::duration_cast<ns>(Time::now() - startTime).count();
                    // Calculate how many progress intervals are covered by the current iteration,
                    // based on the elapsed time and the duration of each progress interval.
                    // Previously covered progress intervals must be skipped.
                    auto progressIntervalTime = durationInNanoseconds / progressBarTotalCount;
                    size_t newProgress = execTime / progressIntervalTime - progressCnt;
                    progressBar.addProgress(newProgress);
                    progressCnt += newProgress;
                }
            }
            fps = batchSize * 1000.0 / statistics.getMedianLatency();
            totalDuration = std::chrono::duration_cast<ns>(Time::now() - startTime).count() * 0.000001;
            progressBar.finish();
        } else {
            std::cout << "[Step 7/8] ";
            if (FLAGS_niter != 0) {
                std::cout << "Start inference asynchronously (" << FLAGS_niter <<
                    " async inference executions, " << FLAGS_nireq <<
                    " inference requests in parallel)" << std::endl;
                progressBarTotalCount = FLAGS_niter + FLAGS_nireq - 1;
            } else {
                std::cout << std::endl << "Start inference asynchronously (" << durationInNanoseconds * 0.000001 <<
                    " ms duration, " << FLAGS_nireq <<
                    " inference requests in parallel)" << std::endl;
                progressBarTotalCount = progressBarDefaultTotalCount;
            }

            size_t currentInference = 0ULL;
            bool requiredInferenceRequestsWereExecuted = false;
            long long previousInference = 1LL - FLAGS_nireq;

            // warm-up run, excluded from the measured scope
            inferRequests[0]->startAsync();
            inferRequests[0]->wait();

            const auto startTime = Time::now();
            auto execTime = std::chrono::duration_cast<ns>(Time::now() - startTime).count();

            /** Start inference & calculate performance **/
            /** Use FLAGS_niter + FLAGS_nireq - 1 iterations to guarantee that the last infer requests are executed under the same conditions **/
            progressBar.newBar(progressBarTotalCount);
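            // The loop below cycles through the pool of infer requests: each iteration starts
            // request 'currentInference' asynchronously and waits for the request started
            // FLAGS_nireq iterations earlier ('previousInference'), so up to FLAGS_nireq
            // requests are in flight at any time.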
            while ((!requiredInferenceRequestsWereExecuted) ||
                   (iteration < FLAGS_niter + FLAGS_nireq - 1) ||
                   ((FLAGS_niter == 0LL) && (execTime < durationInNanoseconds))) {
                // start the next inference
                inferRequests[currentInference]->startAsync();

                // wait for the oldest in-flight inference to complete, if one exists
                if (previousInference >= 0) {
                    inferRequests[previousInference]->wait();
                    // update statistics with PM counters only in case of detailed or median reports
                    statistics.add((FLAGS_report_type == detailedCntReport || FLAGS_report_type == medianCntReport) ?
                        inferRequests[previousInference]->getPerformanceCounts() : emptyStat,
                        inferRequests[previousInference]->getExecTime());
                }

                currentInference++;
                if (currentInference >= FLAGS_nireq) {
                    currentInference = 0;
                    requiredInferenceRequestsWereExecuted = true;
                }

                previousInference++;
                if (previousInference >= FLAGS_nireq) {
                    previousInference = 0;
                }

                iteration++;

                if (FLAGS_niter > 0) {
                    progressBar.addProgress(1);
                } else {
                    execTime = std::chrono::duration_cast<ns>(Time::now() - startTime).count();
                    // Calculate how many progress intervals are covered by the current iteration,
                    // based on the elapsed time and the duration of each progress interval.
                    // Previously covered progress intervals must be skipped.
                    auto progressIntervalTime = durationInNanoseconds / progressBarTotalCount;
                    size_t newProgress = execTime / progressIntervalTime - progressCnt;
                    progressBar.addProgress(newProgress);
                    progressCnt += newProgress;
                }
            }

            // wait for the remaining in-flight inference requests to complete
            for (size_t notCompletedIndex = 0ULL; notCompletedIndex < (FLAGS_nireq - 1); ++notCompletedIndex) {
                if (previousInference >= 0) {
                    inferRequests[previousInference]->wait();
                    // update statistics with PM counters only in case of detailed or median reports
                    statistics.add((FLAGS_report_type == detailedCntReport || FLAGS_report_type == medianCntReport) ?
                        inferRequests[previousInference]->getPerformanceCounts() : emptyStat,
                        inferRequests[previousInference]->getExecTime());
                }

                previousInference++;
                if (previousInference >= FLAGS_nireq) {
                    previousInference = 0LL;
                }
            }
            totalDuration = std::chrono::duration_cast<ns>(Time::now() - startTime).count() * 0.000001;
            fps = batchSize * 1000.0 * iteration / totalDuration;
            progressBar.finish();
        }

        std::cout << "[Step 8/8] Dump statistics report" << std::endl;
        progressBar.newBar(1);
        statistics.dump(fps, iteration, totalDuration);

        if (!FLAGS_exec_graph_path.empty()) {
            CNNNetwork execGraphInfo = exeNetwork.GetExecGraphInfo();
            execGraphInfo.serialize(FLAGS_exec_graph_path);
            slog::info << "executable graph is stored to " << FLAGS_exec_graph_path << slog::endl;
        }
        progressBar.addProgress(1);
        progressBar.finish();

        std::cout << "Latency: " << statistics.getMedianLatency() << " ms" << std::endl;
        std::cout << "Throughput: " << fps << " FPS" << std::endl;
    } catch (const std::exception& ex) {
        slog::err << ex.what() << slog::endl;

long long getDurationInNanoseconds(const std::string& device) {
    static const std::vector<std::pair<std::string, long long>> deviceDurationsInSeconds{

    long long duration = 0LL;
    for (const auto& deviceDurationInSeconds : deviceDurationsInSeconds) {
        if (device.find(deviceDurationInSeconds.first) != std::string::npos) {
            duration = std::max(duration, deviceDurationInSeconds.second);
        }
    }

    if (duration == 0LL) {
        const auto unknownDeviceIt = find_if(
            deviceDurationsInSeconds.begin(),
            deviceDurationsInSeconds.end(),
            [](std::pair<std::string, long long> deviceDuration) { return deviceDuration.first == "UNKNOWN"; });

        if (unknownDeviceIt == deviceDurationsInSeconds.end()) {
            throw std::logic_error("UNKNOWN device was not found in device duration list");
        }
        duration = unknownDeviceIt->second;
        slog::warn << "Default duration " << duration << " seconds for unknown device '" << device << "' is used" << slog::endl;
    }

    return duration * 1000000000LL;
}
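
/**
 * Copies image data from filePaths into inputBlob: up to batchSize images are read
 * with FormatReader and converted from interleaved pixel order into the blob's
 * planar, per-channel layout.
 */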
void fillBlobWithImage(
    Blob::Ptr& inputBlob,
    const std::vector<std::string>& filePaths,
    const size_t& batchSize,
    const InferenceEngine::InputInfo& info) {

    auto inputBlobData = inputBlob->buffer().as<uint8_t*>();
    const SizeVector& inputBlobDims = inputBlob->dims();

    slog::info << "Network Input dimensions (" << info.getTensorDesc().getLayout() << "): ";
    for (const auto& i : info.getTensorDesc().getDims()) {
        slog::info << i << " ";
    }
    slog::info << slog::endl;

    /** Collect images data ptrs **/
    std::vector<std::shared_ptr<uint8_t>> vreader;
    vreader.reserve(batchSize);
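
    // Read one image per batch position; when fewer files than the batch size are given,
    // the file index wraps around so inputs are duplicated (matching the warning above).
    // Images that cannot be read are skipped.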
    for (size_t i = 0ULL, inputIndex = 0ULL; i < batchSize; i++, inputIndex++) {
        if (inputIndex >= filePaths.size()) {
            inputIndex = 0;
        }

        slog::info << "Prepare image " << filePaths[inputIndex] << slog::endl;
        FormatReader::ReaderPtr reader(filePaths[inputIndex].c_str());
        if (reader.get() == nullptr) {
            slog::warn << "Image " << filePaths[inputIndex] << " cannot be read!" << slog::endl << slog::endl;
            continue;
        }

        /** Getting image data **/
        std::shared_ptr<uint8_t> imageData(reader->getData(info.getDims()[0], info.getDims()[1]));
        vreader.push_back(imageData);
    }

    /** Fill input tensor with images. First b channel, then g and r channels **/
    const size_t numChannels = inputBlobDims[2];
    const size_t imageSize = inputBlobDims[1] * inputBlobDims[0];
    /** Iterate over all input images **/
    for (size_t imageId = 0; imageId < vreader.size(); ++imageId) {
        /** Iterate over all pixels in image (b,g,r) **/
        for (size_t pid = 0; pid < imageSize; pid++) {
            /** Iterate over all channels **/
            for (size_t ch = 0; ch < numChannels; ++ch) {
                /** [images stride + channels stride + pixel id] all in bytes **/
                inputBlobData[imageId * imageSize * numChannels + ch * imageSize + pid] = vreader.at(imageId).get()[pid*numChannels + ch];
            }
        }
    }
}