inference-engine/samples/perfcheck/main.cpp

   1 // Copyright (C) 2018-2019 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
   5 #if defined(_WIN32)
   6 #include <os/windows/w_dirent.h>
   7 #else
   8 #include <sys/stat.h>
   9 #include <dirent.h>
  10 #endif
  11
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <future>
#include <iomanip>
#include <limits>
#include <map>
#include <memory>
#include <mutex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
  26
  27 #include <gflags/gflags.h>
  28 #include <opencv2/opencv.hpp>
  29
  30 #include "inference_engine.hpp"
  31 #include "ext_list.hpp"
  32
  33 #include "vpu/vpu_plugin_config.hpp"
  34 #include "samples/common.hpp"
  35 #include "samples/slog.hpp"
  36
  37 #include "perfcheck.h"
  38
  39
  40 static bool parseCommandLine(int *argc, char ***argv) {
  41     gflags::ParseCommandLineNonHelpFlags(argc, argv, true);
  42
  43     if (FLAGS_h) {
  44         showUsage();
  45         return false;
  46     }
  47
  48     if (FLAGS_m.empty()) {
  49         throw std::invalid_argument("Path to model xml file is required");
  50     }
  51
  52     if (FLAGS_num_iterations < MIN_ITERATIONS) {
  53         throw std::invalid_argument("Number of iterations must be not smaller than 1000. "
  54                                     "Got " + std::to_string(FLAGS_num_iterations));
  55     }
  56
  57     if (MAX_NETWORKS < FLAGS_num_networks) {
  58         throw std::invalid_argument("Only number of networks not greater than " + std::to_string(MAX_NETWORKS) + " "
  59                                     "is supported. Got " + std::to_string(FLAGS_num_networks));
  60     }
  61
  62     if (FLAGS_d.empty()) {
  63         throw std::invalid_argument("Plugin name is required");
  64     }
  65
  66     if (1 < *argc) {
  67         std::stringstream message;
  68         message << "Unknown arguments: ";
  69         for (auto arg = 1; arg < *argc; arg++) {
  70             message << argv[arg];
  71             if (arg < *argc) {
  72                 message << " ";
  73             }
  74         }
  75         throw std::invalid_argument(message.str());
  76     }
  77
  78     return true;
  79 }
  80
  81 static std::map<std::string, std::string> parseConfig(const std::string &configName, char comment = '#') {
  82     std::map<std::string, std::string> config = {};
  83
  84     std::ifstream file(configName);
  85     if (!file.is_open()) {
  86         return config;
  87     }
  88
  89     std::string key, value;
  90     while (file >> key >> value) {
  91         if (key.empty() || key[0] == comment) {
  92             continue;
  93         }
  94         config[key] = value;
  95     }
  96
  97     return config;
  98 }
  99
 100 static std::size_t getNumberRequests(const std::string &plugin) {
 101     static const std::unordered_map<std::string, std::size_t> supported_plugins = {
 102         { "MYRIAD", 4   },
 103         { "HDDL",   100 },
 104         { "FPGA",   3   },
 105     };
 106
 107     auto device = plugin;
 108     if (plugin.find("HETERO:") == 0) {
 109         auto separator   = plugin.find(",");
 110         auto deviceBegin = std::string("HETERO:").size();
 111         auto deviceEnd   = separator == std::string::npos ? plugin.size() : separator;
 112         device = plugin.substr(deviceBegin, deviceEnd - deviceBegin);
 113     }
 114
 115     auto num_requests = supported_plugins.find(device);
 116     return num_requests == supported_plugins.end() ? 1 : num_requests->second;
 117 }
 118
 119 #if defined(WIN32) || defined(__APPLE__)
 120 typedef std::chrono::time_point<std::chrono::steady_clock> time_point;
 121 #else
 122 typedef std::chrono::time_point<std::chrono::system_clock> time_point;
 123 #endif
 124
 125 static void printFPS(std::size_t num_requests, std::size_t num_intervals, const std::vector<time_point> &points) {
 126     std::size_t num_exclude = 2 * num_requests;
 127     /* evaluate from the end of previous */
 128     std::size_t first_point = num_exclude - 1;
 129     std::size_t last_point  = points.size() - num_exclude;
 130     auto begin = points[first_point];
 131     auto end   = points[last_point - 1];
 132
 133     using ms = std::chrono::duration<double, std::ratio<1, 1000>>;
 134
 135     auto num_iterations = last_point - first_point - 1;
 136     auto total = std::chrono::duration_cast<ms>(end - begin).count();
 137     auto avg_fps = static_cast<double>(num_iterations) * 1000.0 * FLAGS_batch / total;
 138
 139     auto min_fps = std::numeric_limits<double>::max();
 140     auto max_fps = std::numeric_limits<double>::min();
 141     double step = total / num_intervals;
 142     std::size_t first_point_in_interval = first_point + 1;
 143     auto first_time_in_interval = std::chrono::time_point_cast<ms>(begin);
 144     for (std::size_t interval = 0; interval < num_intervals; interval++) {
 145         std::size_t num_points_in_interval = 0;
 146         auto last_time_in_interval = first_time_in_interval + ms(step);
 147         if (interval == num_intervals - 1) {
 148             last_time_in_interval = end;
 149         }
 150
 151         while (first_point_in_interval + num_points_in_interval < last_point &&
 152                points[first_point_in_interval + num_points_in_interval] <= last_time_in_interval) {
 153             num_points_in_interval++;
 154         }
 155
 156         double fps = num_points_in_interval * FLAGS_batch / step * 1000;
 157         min_fps = std::min(min_fps, fps);
 158         max_fps = std::max(max_fps, fps);
 159
 160         first_point_in_interval += num_points_in_interval;
 161         first_time_in_interval = last_time_in_interval;
 162     }
 163
 164     std::cout << std::endl;
 165     std::cout << "Total time:     " << total << " ms";
 166     std::cout << std::endl;
 167
 168     std::cout << "Num iterations: " << num_iterations << std::endl;
 169     std::cout << "Batch:          " << FLAGS_batch << std::endl;
 170
 171     std::cout << "Min FPS:        " << min_fps << std::endl;
 172     std::cout << "Avg FPS:        " << avg_fps << std::endl;
 173     std::cout << "Max FPS:        " << max_fps << std::endl;
 174 }
 175
 176 template<typename T>
 177 static bool isImage(const T &blob) {
 178     auto descriptor = blob->getTensorDesc();
 179     if (descriptor.getLayout() != InferenceEngine::NCHW) {
 180         return false;
 181     }
 182
 183     auto channels = descriptor.getDims()[1];
 184     return channels == 3;
 185 }
 186
 187 static std::vector<std::string> extractFilesByExtension(const std::string &directory, const std::string &extension) {
 188     std::vector<std::string> files;
 189
 190     DIR *dir = opendir(directory.c_str());
 191     if (!dir) {
 192         throw std::invalid_argument("Can not open " + directory);
 193     }
 194
 195     auto getExtension = [](const std::string &name) {
 196         auto extensionPosition = name.rfind('.', name.size());
 197         return extensionPosition == std::string::npos ? "" : name.substr(extensionPosition + 1, name.size() - 1);
 198     };
 199
 200     dirent *ent = nullptr;
 201     while ((ent = readdir(dir))) {
 202         std::string file_name = ent->d_name;
 203         if (getExtension(file_name) != extension) {
 204             continue;
 205         }
 206
 207         std::stringstream stream;
 208         stream << directory << "/" << file_name;
 209
 210         auto full_file_name = stream.str();
 211
 212         struct stat st = {};
 213         if (stat(full_file_name.c_str(), &st) != 0) {
 214             continue;
 215         }
 216
 217         bool is_directory = (st.st_mode & S_IFDIR) != 0;
 218         if (is_directory) {
 219             continue;
 220         }
 221
 222         files.push_back(full_file_name);
 223     }
 224
 225     closedir(dir);
 226
 227     return files;
 228 }
 229
 230 static float asfloat(uint32_t v) {
 231     union {
 232         float f;
 233         std::uint32_t u;
 234     } converter = {0};
 235     converter.u = v;
 236     return converter.f;
 237 }
 238
 239 static short f32tof16(float x) {
 240     static float min16 = asfloat((127 - 14) << 23);
 241
 242     static float max16 = asfloat(((127 + 15) << 23) | 0x007FE000);
 243     static uint32_t max16f16 = ((15 + 15) << 10) | 0x3FF;
 244
 245     static constexpr std::uint32_t EXP_MASK_F32 = 0x7F800000U;
 246
 247     union {
 248         float f;
 249         uint32_t u;
 250     } v = {0};
 251     v.f = x;
 252
 253     uint32_t s = (v.u >> 16) & 0x8000;
 254
 255     v.u &= 0x7FFFFFFF;
 256
 257     if ((v.u & EXP_MASK_F32) == EXP_MASK_F32) {
 258         if (v.u & 0x007FFFFF) {
 259             return static_cast<short>(s | (v.u >> (23 - 10)) | 0x0200);
 260         } else {
 261             return static_cast<short>(s | (v.u >> (23 - 10)));
 262         }
 263     }
 264
 265     float halfULP = asfloat(v.u & EXP_MASK_F32) * asfloat((127 - 11) << 23);
 266     v.f += halfULP;
 267
 268     if (v.f < min16 * 0.5f) {
 269         return static_cast<short>(s);
 270     }
 271
 272     if (v.f < min16) {
 273         return static_cast<short>(s | (1 << 10));
 274     }
 275
 276     if (v.f >= max16) {
 277         return static_cast<short>(max16f16 | s);
 278     }
 279
 280     v.u -= ((127 - 15) << 23);
 281
 282     v.u >>= (23 - 10);
 283
 284     return static_cast<short>(v.u | s);
 285 }
 286
 287 static void loadImage(const std::string &imageFilename, InferenceEngine::Blob::Ptr &blob) {
 288     InferenceEngine::TensorDesc tensDesc = blob->getTensorDesc();
 289
 290     cv::Mat image = cv::imread(imageFilename);
 291     if (image.empty()) {
 292         throw std::invalid_argument("Can not read image from " + imageFilename);
 293     }
 294
 295     std::size_t batch = blob->dims()[3];
 296     std::size_t w = blob->dims()[0];
 297     std::size_t h = blob->dims()[1];
 298     auto img_w = static_cast<std::size_t>(image.cols);
 299     auto img_h = static_cast<std::size_t>(image.rows);
 300
 301     auto numBlobChannels = blob->dims()[2];
 302     auto numImageChannels = static_cast<std::size_t>(image.channels());
 303     if (numBlobChannels != numImageChannels && numBlobChannels != 1) {
 304         throw std::invalid_argument("Input channels mismatch: image channels " + std::to_string(numImageChannels) +
 305                                     ", network channels " + std::to_string(numBlobChannels) +
 306                                     ", expecting count of image channels are equal to count if network channels"
 307                                     "or count of network channels are equal to 1");
 308     }
 309
 310     auto nPixels = w * h;
 311     unsigned char *RGB8 = image.data;
 312     float xscale = 1.0f * img_w / w;
 313     float yscale = 1.0f * img_h / h;
 314
 315     for (std::size_t n = 0; n != batch; n++) {
 316         for (std::size_t i = 0; i < h; ++i) {
 317             auto y = static_cast<std::size_t>(std::floor((i + 0.5f) * yscale));
 318             for (std::size_t j = 0; j < w; ++j) {
 319                 auto x = static_cast<std::size_t>(std::floor((j + 0.5f) * xscale));
 320                 for (std::size_t k = 0; k < numBlobChannels; k++) {
 321                     float value = 1.0f * RGB8[(y * img_w + x) * numImageChannels + k];
 322                     if (InferenceEngine::Precision::FP16 == tensDesc.getPrecision()) {
 323                         if (tensDesc.getLayout() == InferenceEngine::NHWC) {
 324                             blob->buffer().as<std::int16_t *>()[n * h * w * numBlobChannels + (i * w + j) * numBlobChannels + k] = f32tof16(value);
 325                         } else {
 326                             blob->buffer().as<std::int16_t *>()[n * h * w * numBlobChannels + (i * w + j) + k * nPixels] = f32tof16(value);
 327                         }
 328                     } else {
 329                         if (tensDesc.getLayout() == InferenceEngine::NHWC) {
 330                             blob->buffer().as<float *>()[n * h * w * numBlobChannels + (i * w + j) * numBlobChannels + k] = value;
 331                         } else {
 332                             blob->buffer().as<float *>()[n * h * w * numBlobChannels + (i * w + j) + k * nPixels] = value;
 333                         }
 334                     }
 335                 }
 336             }
 337         }
 338     }
 339 }
 340
 341 static void loadBinaryTensor(const std::string &binaryFileName, InferenceEngine::Blob::Ptr &blob) {
 342     InferenceEngine::TensorDesc tensDesc = blob->getTensorDesc();
 343
 344     std::ifstream binaryFile(binaryFileName, std::ios_base::binary | std::ios_base::ate);
 345     if (!binaryFile) {
 346         throw std::invalid_argument("Can not open \"" + binaryFileName + "\"");
 347     }
 348
 349     auto fileSize = static_cast<std::size_t>(binaryFile.tellg());
 350     binaryFile.seekg(0, std::ios_base::beg);
 351     if (!binaryFile.good()) {
 352         throw std::invalid_argument("Can not read \"" + binaryFileName + "\"");
 353     }
 354
 355     auto networkSize = blob->size() * sizeof(float);
 356     if (fileSize != networkSize) {
 357         throw std::invalid_argument("File \"" + binaryFileName + "\" contains " + std::to_string(fileSize) + " bytes "
 358                                     "but network expects " + std::to_string(networkSize));
 359     }
 360
 361     for (std::size_t i = 0; i < blob->size(); i++) {
 362         float src = 0.f;
 363         binaryFile.read(reinterpret_cast<char *>(&src), sizeof(float));
 364         if (InferenceEngine::Precision::FP16 == tensDesc.getPrecision()) {
 365             blob->buffer().as<std::int16_t *>()[i] = f32tof16(src);
 366         } else {
 367             blob->buffer().as<float *>()[i] = src;
 368         }
 369     }
 370 }
 371
 372 static void loadInputs(std::size_t requestIdx, const std::vector<std::string> &images,
 373                        const std::vector<std::string> &binaries, InferenceEngine::InferRequest &request,
 374                        InferenceEngine::CNNNetwork &network) {
 375     for (auto &&input : network.getInputsInfo()) {
 376         auto blob = request.GetBlob(input.first);
 377
 378         if (isImage(blob)) {
 379             loadImage(images[requestIdx % images.size()], blob);
 380         } else {
 381             loadBinaryTensor(binaries[requestIdx % binaries.size()], blob);
 382         }
 383     }
 384 }
 385
 386 int main(int argc, char *argv[]) {
 387     try {
 388         slog::info << "Inference Engine: " << InferenceEngine::GetInferenceEngineVersion() << slog::endl;
 389
 390         if (!parseCommandLine(&argc, &argv)) {
 391             return EXIT_SUCCESS;
 392         }
 393
 394         std::string binFileName = fileNameNoExt(FLAGS_m) + ".bin";
 395         slog::info << "Loading network files:" <<
 396             slog::endl << "\t" << FLAGS_m <<
 397             slog::endl << "\t" << binFileName <<
 398         slog::endl;
 399
 400         InferenceEngine::CNNNetReader networkReader;
 401         networkReader.ReadNetwork(FLAGS_m);
 402         networkReader.ReadWeights(binFileName);
 403
 404         auto network = networkReader.getNetwork();
 405         network.setBatchSize(FLAGS_batch);
 406
 407         if (FLAGS_d.find("MYRIAD") != std::string::npos || FLAGS_d.find("HDDL") != std::string::npos) {
 408             /**
 409              * on VPU devices FP16 precision allows avoid extra conversion operations and shows better performance
 410              **/
 411             for (auto &&input : network.getInputsInfo()) {
 412                 input.second->setPrecision(InferenceEngine::Precision::FP16);
 413             }
 414
 415             for (auto &&output : network.getOutputsInfo()) {
 416                 output.second->setPrecision(InferenceEngine::Precision::FP16);
 417             }
 418         }
 419
 420         auto plugin = InferenceEngine::PluginDispatcher({FLAGS_pp}).getPluginByDevice(FLAGS_d);
 421
 422         /* If CPU device, load default library with extensions that comes with the product */
 423         if (FLAGS_d.find("CPU") != std::string::npos) {
 424             /**
 425              * cpu_extensions library is compiled from "extension" folder containing
 426              * custom MKLDNNPlugin layer implementations. These layers are not supported
 427              * by mkldnn, but they can be useful for inferencing custom topologies.
 428              **/
 429             plugin.AddExtension(std::make_shared<InferenceEngine::Extensions::Cpu::CpuExtensions>());
 430         }
 431
 432         if (!FLAGS_l.empty()) {
 433             plugin.AddExtension(InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(FLAGS_l));
 434             slog::info << "CPU Extension loaded: " << FLAGS_l << slog::endl;
 435         }
 436
 437         if (!FLAGS_c.empty()) {
 438             /* clDNN Extensions are loaded from an .xml description and OpenCL kernel files */
 439             plugin.SetConfig({{InferenceEngine::PluginConfigParams::KEY_CONFIG_FILE, FLAGS_c}});
 440             slog::info << "GPU Extension loaded: " << FLAGS_c << slog::endl;
 441         }
 442
 443         auto config = parseConfig(FLAGS_config);
 444         std::vector<InferenceEngine::ExecutableNetwork> networks(FLAGS_num_networks);
 445         for (std::size_t net = 0; net < networks.size(); ++net) {
 446             slog::info << "Loading network " << net;
 447             if (FLAGS_d.find("FPGA") != std::string::npos) {
 448                 if (FLAGS_num_fpga_devices != 1) {
 449                     config[InferenceEngine::PluginConfigParams::KEY_DEVICE_ID] = std::to_string(net % FLAGS_num_fpga_devices);
 450                     slog::info << " to device " << (net % FLAGS_num_fpga_devices);
 451                 }
 452             }
 453             slog::info << slog::endl;
 454
 455             networks[net] = plugin.LoadNetwork(network, config);
 456         }
 457         slog::info << "All networks are loaded" << slog::endl;
 458
 459         auto num_requests = FLAGS_num_requests == 0 ? getNumberRequests(FLAGS_d) : FLAGS_num_requests;
 460
 461         auto images = extractFilesByExtension(FLAGS_inputs_dir, "bmp");
 462         auto hasImageInput = [](const InferenceEngine::CNNNetwork &net) {
 463             auto inputs = net.getInputsInfo();
 464             auto isImageInput = [](const InferenceEngine::InputsDataMap::value_type &input) {
 465                 return isImage(input.second);
 466             };
 467             return std::any_of(inputs.begin(), inputs.end(), isImageInput);
 468         };
 469
 470         if (hasImageInput(network) && images.empty()) {
 471             throw std::invalid_argument("The directory \"" + FLAGS_inputs_dir + "\" does not contain images for network");
 472         }
 473
 474         auto binaries = extractFilesByExtension(FLAGS_inputs_dir, "bin");
 475         auto hasBinaryInput = [](const InferenceEngine::CNNNetwork &net) {
 476             auto inputs = net.getInputsInfo();
 477             auto isBinaryInput = [](const InferenceEngine::InputsDataMap::value_type &input) {
 478                 return !isImage(input.second);
 479             };
 480             return std::any_of(inputs.begin(), inputs.end(), isBinaryInput);
 481         };
 482
 483         if (hasBinaryInput(network) && binaries.empty()) {
 484             throw std::invalid_argument("The directory \"" + FLAGS_inputs_dir + "\" does not contain binaries for network");
 485         }
 486
 487         std::size_t iteration{0};
 488         std::mutex dump_time;
 489         std::atomic<std::size_t> num_finished{0};
 490
 491         std::promise<void> done;
 492         num_requests *= FLAGS_num_networks;
 493         std::size_t num_iterations = 2 * num_requests + FLAGS_num_iterations + 2 * num_requests;
 494
 495         std::vector<InferenceEngine::InferRequest> requests(num_requests);
 496         std::vector<time_point> time_points(num_iterations);
 497
 498         using callback_t = std::function<void(InferenceEngine::InferRequest, InferenceEngine::StatusCode)>;
 499
 500         for (std::size_t request = 0; request < num_requests; ++request) {
 501             requests[request] = networks[request % networks.size()].CreateInferRequest();
 502
 503             loadInputs(request, images, binaries, requests[request], network);
 504
 505             callback_t callback =
 506                 [num_requests, num_iterations, &iteration, &time_points, &dump_time, &num_finished, &done]
 507                 (InferenceEngine::InferRequest inferRequest, InferenceEngine::StatusCode code) {
 508                 if (code != InferenceEngine::StatusCode::OK) {
 509                     THROW_IE_EXCEPTION << "Infer request failed with code " << code;
 510                 }
 511
 512                 std::size_t current_finished_iteration = 0;
 513                 {
 514                     std::lock_guard<std::mutex> lock(dump_time);
 515
 516                     current_finished_iteration = iteration++;
 517                     if (current_finished_iteration < num_iterations) {
 518                         time_points[current_finished_iteration] = std::chrono::high_resolution_clock::now();
 519                     }
 520                 }
 521
 522                 if (current_finished_iteration < num_iterations - 1) {
 523                     inferRequest.StartAsync();
 524                 } else {
 525                     if (++num_finished == num_requests) {
 526                         done.set_value();
 527                     }
 528                 }
 529             };
 530
 531             requests[request].SetCompletionCallback<callback_t>(callback);
 532         }
 533
 534         auto doneFuture = done.get_future();
 535
 536         for (auto &&request : requests) {
 537             request.StartAsync();
 538         }
 539
 540         doneFuture.wait();
 541
 542         printFPS(num_requests, 10, time_points);
 543     } catch (const std::exception &error) {
 544         slog::err << error.what() << slog::endl;
 545         return EXIT_FAILURE;
 546     } catch (...) {
 547         slog::err << "Unknown/internal exception happened." << slog::endl;
 548         return EXIT_FAILURE;
 549     }
 550
 551     return EXIT_SUCCESS;
 552 }