1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
6 #include <os/windows/w_dirent.h>
23 #include <unordered_map>
27 #include <gflags/gflags.h>
28 #include <opencv2/opencv.hpp>
30 #include "inference_engine.hpp"
31 #include "ext_list.hpp"
33 #include "vpu/vpu_plugin_config.hpp"
34 #include "samples/common.hpp"
35 #include "samples/slog.hpp"
37 #include "perfcheck.h"
// Parses and validates the sample's command-line flags via gflags.
// Returns false when execution should stop early (handled on elided lines,
// presumably for --help); throws std::invalid_argument on bad values.
// NOTE(review): several lines are elided in this view (closing braces,
// help handling, the unknown-argument loop body); comments describe only
// what the visible code demonstrates.
40 static bool parseCommandLine(int *argc, char ***argv) {
// remove_flags=true strips recognized flags from argc/argv in place.
41 gflags::ParseCommandLineNonHelpFlags(argc, argv, true);
// -m (path to model .xml) is mandatory.
48 if (FLAGS_m.empty()) {
49 throw std::invalid_argument("Path to model xml file is required");
// Enforce a minimum iteration count so the FPS statistics are meaningful.
52 if (FLAGS_num_iterations < MIN_ITERATIONS) {
53 throw std::invalid_argument("Number of iterations must be not smaller than 1000. "
54 "Got " + std::to_string(FLAGS_num_iterations));
// Cap the number of concurrently loaded networks at MAX_NETWORKS.
57 if (MAX_NETWORKS < FLAGS_num_networks) {
58 throw std::invalid_argument("Only number of networks not greater than " + std::to_string(MAX_NETWORKS) + " "
59 "is supported. Got " + std::to_string(FLAGS_num_networks));
// -d (target device / plugin name) is mandatory.
62 if (FLAGS_d.empty()) {
63 throw std::invalid_argument("Plugin name is required");
// Anything left in argv after gflags parsing is an unrecognized argument;
// collect all of them into one diagnostic message before throwing.
67 std::stringstream message;
68 message << "Unknown arguments: ";
69 for (auto arg = 1; arg < *argc; arg++) {
75 throw std::invalid_argument(message.str());
// Reads a whitespace-separated key/value config file into a map.
// Lines whose key starts with `comment` (default '#') are skipped
// (skip logic continues on elided lines). A missing/unopenable file is
// tolerated — the handling of that branch is elided here; presumably the
// empty map is returned. TODO confirm against the full source.
81 static std::map<std::string, std::string> parseConfig(const std::string &configName, char comment = '#') {
82 std::map<std::string, std::string> config = {};
84 std::ifstream file(configName);
85 if (!file.is_open()) {
// stream extraction tokenizes on whitespace: one key and one value per entry.
89 std::string key, value;
90 while (file >> key >> value) {
91 if (key.empty() || key[0] == comment) {
// Returns the default number of parallel infer requests for a device name.
// The per-device table contents are elided in this view; unknown devices
// fall back to 1 request.
100 static std::size_t getNumberRequests(const std::string &plugin) {
101 static const std::unordered_map<std::string, std::size_t> supported_plugins = {
// For "HETERO:devA,devB,..." strings, look up only the first (primary)
// device: the substring between "HETERO:" and the first comma (or end of
// string when there is no comma).
107 auto device = plugin;
108 if (plugin.find("HETERO:") == 0) {
109 auto separator = plugin.find(",");
110 auto deviceBegin = std::string("HETERO:").size();
111 auto deviceEnd = separator == std::string::npos ? plugin.size() : separator;
112 device = plugin.substr(deviceBegin, deviceEnd - deviceBegin);
115 auto num_requests = supported_plugins.find(device);
116 return num_requests == supported_plugins.end() ? 1 : num_requests->second;
119 #if defined(WIN32) || defined(__APPLE__)
120 typedef std::chrono::time_point<std::chrono::steady_clock> time_point;
122 typedef std::chrono::time_point<std::chrono::system_clock> time_point;
125 static void printFPS(std::size_t num_requests, std::size_t num_intervals, const std::vector<time_point> &points) {
126 std::size_t num_exclude = 2 * num_requests;
127 /* evaluate from the end of previous */
128 std::size_t first_point = num_exclude - 1;
129 std::size_t last_point = points.size() - num_exclude;
130 auto begin = points[first_point];
131 auto end = points[last_point - 1];
133 using ms = std::chrono::duration<double, std::ratio<1, 1000>>;
135 auto num_iterations = last_point - first_point - 1;
136 auto total = std::chrono::duration_cast<ms>(end - begin).count();
137 auto avg_fps = static_cast<double>(num_iterations) * 1000.0 * FLAGS_batch / total;
139 auto min_fps = std::numeric_limits<double>::max();
140 auto max_fps = std::numeric_limits<double>::min();
141 double step = total / num_intervals;
142 std::size_t first_point_in_interval = first_point + 1;
143 auto first_time_in_interval = std::chrono::time_point_cast<ms>(begin);
144 for (std::size_t interval = 0; interval < num_intervals; interval++) {
145 std::size_t num_points_in_interval = 0;
146 auto last_time_in_interval = first_time_in_interval + ms(step);
147 if (interval == num_intervals - 1) {
148 last_time_in_interval = end;
151 while (first_point_in_interval + num_points_in_interval < last_point &&
152 points[first_point_in_interval + num_points_in_interval] <= last_time_in_interval) {
153 num_points_in_interval++;
156 double fps = num_points_in_interval * FLAGS_batch / step * 1000;
157 min_fps = std::min(min_fps, fps);
158 max_fps = std::max(max_fps, fps);
160 first_point_in_interval += num_points_in_interval;
161 first_time_in_interval = last_time_in_interval;
164 std::cout << std::endl;
165 std::cout << "Total time: " << total << " ms";
166 std::cout << std::endl;
168 std::cout << "Num iterations: " << num_iterations << std::endl;
169 std::cout << "Batch: " << FLAGS_batch << std::endl;
171 std::cout << "Min FPS: " << min_fps << std::endl;
172 std::cout << "Avg FPS: " << avg_fps << std::endl;
173 std::cout << "Max FPS: " << max_fps << std::endl;
// Heuristic: a blob-like object is treated as an image input iff its tensor
// layout is NCHW and its channel dimension (dims[1] in NCHW) equals 3.
// NOTE(review): the `template <typename T>` header and the body of the
// non-NCHW branch are on elided lines — presumably `return false;` there;
// TODO confirm. T must be pointer-like, exposing ->getTensorDesc().
178 static bool isImage(const T &blob) {
179 auto descriptor = blob->getTensorDesc();
180 if (descriptor.getLayout() != InferenceEngine::NCHW) {
// dims[1] is the channel count for NCHW layout.
183 auto channels = descriptor.getDims()[1];
184 return channels == 3;
// Lists regular files in `directory` whose name ends with `.extension`
// (extension compared without the dot), returning their full paths as
// "directory/name". Throws std::invalid_argument when the directory can't
// be opened. NOTE(review): the `struct stat` declaration, the `continue`
// statements, directory skipping, closedir() and the final return are on
// elided lines; comments describe only the visible statements.
187 static std::vector<std::string> extractFilesByExtension(const std::string &directory, const std::string &extension) {
188 std::vector<std::string> files;
190 DIR *dir = opendir(directory.c_str());
192 throw std::invalid_argument("Can not open " + directory);
// Extracts the substring after the last '.', or "" when there is no dot.
195 auto getExtension = [](const std::string &name) {
196 auto extensionPosition = name.rfind('.', name.size());
197 return extensionPosition == std::string::npos ? "" : name.substr(extensionPosition + 1, name.size() - 1);
200 dirent *ent = nullptr;
201 while ((ent = readdir(dir))) {
202 std::string file_name = ent->d_name;
// Skip entries with a different extension (skip body elided).
203 if (getExtension(file_name) != extension) {
// Build the full path; readdir() yields names relative to `directory`.
207 std::stringstream stream;
208 stream << directory << "/" << file_name;
210 auto full_file_name = stream.str();
// Entries that can't be stat'ed are skipped (branch body elided).
213 if (stat(full_file_name.c_str(), &st) != 0) {
// Directories are detected via st_mode and presumably skipped
// (the skip itself is on an elided line — TODO confirm).
217 bool is_directory = (st.st_mode & S_IFDIR) != 0;
222 files.push_back(full_file_name);
// Reinterprets the bit pattern of a 32-bit unsigned integer as an IEEE-754
// single-precision float. The body is elided in this view — presumably a
// union or byte-copy based bit cast; TODO confirm. Used below to build the
// FP32->FP16 conversion constants.
230 static float asfloat(uint32_t v) {
// Converts an IEEE-754 float to a half-precision (FP16) bit pattern
// returned as `short`. NOTE(review): the union declaration (`v`), sign
// extraction context, and several closing braces are on elided lines;
// comments annotate only the visible branches.
239 static short f32tof16(float x) {
// Smallest normal FP16 magnitude, expressed as a float: exponent 127-14.
240 static float min16 = asfloat((127 - 14) << 23);
// Largest finite FP16 magnitude expressed as a float.
242 static float max16 = asfloat(((127 + 15) << 23) | 0x007FE000);
// Largest finite FP16 bit pattern (exponent 30, mantissa all ones).
243 static uint32_t max16f16 = ((15 + 15) << 10) | 0x3FF;
245 static constexpr std::uint32_t EXP_MASK_F32 = 0x7F800000U;
// Move the float sign bit into the FP16 sign position (bit 15).
253 uint32_t s = (v.u >> 16) & 0x8000;
// All-ones exponent: the input is Inf or NaN.
257 if ((v.u & EXP_MASK_F32) == EXP_MASK_F32) {
258 if (v.u & 0x007FFFFF) {
// NaN: preserve sign, shift mantissa into place, force a quiet-NaN bit.
259 return static_cast<short>(s | (v.u >> (23 - 10)) | 0x0200);
// Infinity: sign plus all-ones FP16 exponent.
261 return static_cast<short>(s | (v.u >> (23 - 10)));
// Half a unit-in-the-last-place of the FP16 result, used for rounding
// decisions (scale input exponent down by 2^-11).
265 float halfULP = asfloat(v.u & EXP_MASK_F32) * asfloat((127 - 11) << 23);
// Magnitude below half the smallest normal FP16: flush to signed zero.
268 if (v.f < min16 * 0.5f) {
269 return static_cast<short>(s);
// Subnormal-range result: presumably the smallest normal is returned here
// (surrounding context elided — TODO confirm).
273 return static_cast<short>(s | (1 << 10));
// Overflow: clamp to the largest finite FP16 value with the input's sign.
277 return static_cast<short>(max16f16 | s);
// Rebias exponent from FP32 (127) to FP16 (15).
280 v.u -= ((127 - 15) << 23);
// Mantissa/exponent already shifted into FP16 position on elided lines;
// combine with the sign bit.
284 return static_cast<short>(v.u | s);
// Reads an image with OpenCV, resizes it to the blob's spatial size via
// nearest-neighbor sampling, and writes it into `blob` in either NHWC or
// NCHW layout, converting to FP16 when the blob precision requires it.
// Throws std::invalid_argument when the image can't be read or channel
// counts are incompatible. NOTE(review): `if (image.empty())`, the `else`
// keywords and closing braces are on elided lines.
287 static void loadImage(const std::string &imageFilename, InferenceEngine::Blob::Ptr &blob) {
288 InferenceEngine::TensorDesc tensDesc = blob->getTensorDesc();
290 cv::Mat image = cv::imread(imageFilename);
292 throw std::invalid_argument("Can not read image from " + imageFilename);
// Deprecated Blob::dims() returns dimensions in reverse (WHCN) order:
// index 0 = width, 1 = height, 2 = channels, 3 = batch.
295 std::size_t batch = blob->dims()[3];
296 std::size_t w = blob->dims()[0];
297 std::size_t h = blob->dims()[1];
298 auto img_w = static_cast<std::size_t>(image.cols);
299 auto img_h = static_cast<std::size_t>(image.rows);
// The image must match the network channel count, unless the network
// expects a single channel (then only channel 0 of the image is used).
301 auto numBlobChannels = blob->dims()[2];
302 auto numImageChannels = static_cast<std::size_t>(image.channels());
303 if (numBlobChannels != numImageChannels && numBlobChannels != 1) {
304 throw std::invalid_argument("Input channels mismatch: image channels " + std::to_string(numImageChannels) +
305 ", network channels " + std::to_string(numBlobChannels) +
306 ", expecting count of image channels are equal to count if network channels"
307 "or count of network channels are equal to 1");
310 auto nPixels = w * h;
// Raw interleaved pixel data as loaded by OpenCV (BGR byte order).
311 unsigned char *RGB8 = image.data;
// Scale factors mapping blob coordinates back to source-image coordinates.
312 float xscale = 1.0f * img_w / w;
313 float yscale = 1.0f * img_h / h;
// Nearest-neighbor resample: each destination pixel (j, i) samples source
// pixel (x, y) at the center of its scaled footprint. The same image is
// replicated across every batch entry n.
315 for (std::size_t n = 0; n != batch; n++) {
316 for (std::size_t i = 0; i < h; ++i) {
317 auto y = static_cast<std::size_t>(std::floor((i + 0.5f) * yscale));
318 for (std::size_t j = 0; j < w; ++j) {
319 auto x = static_cast<std::size_t>(std::floor((j + 0.5f) * xscale));
320 for (std::size_t k = 0; k < numBlobChannels; k++) {
321 float value = 1.0f * RGB8[(y * img_w + x) * numImageChannels + k];
322 if (InferenceEngine::Precision::FP16 == tensDesc.getPrecision()) {
// NHWC: channels interleaved per pixel; NCHW: one plane per channel.
323 if (tensDesc.getLayout() == InferenceEngine::NHWC) {
324 blob->buffer().as<std::int16_t *>()[n * h * w * numBlobChannels + (i * w + j) * numBlobChannels + k] = f32tof16(value);
326 blob->buffer().as<std::int16_t *>()[n * h * w * numBlobChannels + (i * w + j) + k * nPixels] = f32tof16(value);
// FP32 path: same indexing, no half conversion.
329 if (tensDesc.getLayout() == InferenceEngine::NHWC) {
330 blob->buffer().as<float *>()[n * h * w * numBlobChannels + (i * w + j) * numBlobChannels + k] = value;
332 blob->buffer().as<float *>()[n * h * w * numBlobChannels + (i * w + j) + k * nPixels] = value;
// Fills `blob` from a raw binary file of float32 values, converting each
// element to FP16 when the blob precision requires it. The file size must
// equal blob->size() * sizeof(float) exactly. Throws std::invalid_argument
// on open/read failure or size mismatch. NOTE(review): the open-failure
// check, the `float src` declaration and closing braces are on elided
// lines.
341 static void loadBinaryTensor(const std::string &binaryFileName, InferenceEngine::Blob::Ptr &blob) {
342 InferenceEngine::TensorDesc tensDesc = blob->getTensorDesc();
// Open at end (std::ios_base::ate) so tellg() yields the file size.
344 std::ifstream binaryFile(binaryFileName, std::ios_base::binary | std::ios_base::ate);
346 throw std::invalid_argument("Can not open \"" + binaryFileName + "\"");
349 auto fileSize = static_cast<std::size_t>(binaryFile.tellg());
// Rewind to the beginning before reading the payload.
350 binaryFile.seekg(0, std::ios_base::beg);
351 if (!binaryFile.good()) {
352 throw std::invalid_argument("Can not read \"" + binaryFileName + "\"");
// The file always stores float32, regardless of the blob's precision.
355 auto networkSize = blob->size() * sizeof(float);
356 if (fileSize != networkSize) {
357 throw std::invalid_argument("File \"" + binaryFileName + "\" contains " + std::to_string(fileSize) + " bytes "
358 "but network expects " + std::to_string(networkSize));
// Element-by-element copy; `src` is declared on an elided line.
361 for (std::size_t i = 0; i < blob->size(); i++) {
363 binaryFile.read(reinterpret_cast<char *>(&src), sizeof(float));
364 if (InferenceEngine::Precision::FP16 == tensDesc.getPrecision()) {
365 blob->buffer().as<std::int16_t *>()[i] = f32tof16(src);
367 blob->buffer().as<float *>()[i] = src;
// Fills every input blob of `request`: image-like inputs are loaded from
// `images`, the rest from `binaries`, each indexed round-robin by
// requestIdx so different requests get different files. NOTE(review): the
// image-vs-binary dispatch condition sits on an elided line — presumably
// `if (isImage(blob))`; TODO confirm.
372 static void loadInputs(std::size_t requestIdx, const std::vector<std::string> &images,
373 const std::vector<std::string> &binaries, InferenceEngine::InferRequest &request,
374 InferenceEngine::CNNNetwork &network) {
375 for (auto &&input : network.getInputsInfo()) {
376 auto blob = request.GetBlob(input.first);
// Round-robin over available files; callers must ensure the relevant
// vector is non-empty (main() validates this before use).
379 loadImage(images[requestIdx % images.size()], blob);
381 loadBinaryTensor(binaries[requestIdx % binaries.size()], blob);
// Sample entry point: parse flags, read an IR network, set device-friendly
// precisions, load the network onto one or more plugin instances, then run
// asynchronous inference with self-resubmitting completion callbacks and
// print FPS statistics. NOTE(review): the try/catch scaffolding, return
// statements, and many closing braces are on elided lines; comments cover
// only the visible statements.
386 int main(int argc, char *argv[]) {
388 slog::info << "Inference Engine: " << InferenceEngine::GetInferenceEngineVersion() << slog::endl;
390 if (!parseCommandLine(&argc, &argv)) {
// The weights file is expected next to the .xml with a .bin extension.
394 std::string binFileName = fileNameNoExt(FLAGS_m) + ".bin";
395 slog::info << "Loading network files:" <<
396 slog::endl << "\t" << FLAGS_m <<
397 slog::endl << "\t" << binFileName <<
400 InferenceEngine::CNNNetReader networkReader;
401 networkReader.ReadNetwork(FLAGS_m);
402 networkReader.ReadWeights(binFileName);
404 auto network = networkReader.getNetwork();
405 network.setBatchSize(FLAGS_batch);
// VPU devices (MYRIAD/HDDL) prefer FP16 I/O precision.
407 if (FLAGS_d.find("MYRIAD") != std::string::npos || FLAGS_d.find("HDDL") != std::string::npos) {
409 * on VPU devices FP16 precision allows avoid extra conversion operations and shows better performance
411 for (auto &&input : network.getInputsInfo()) {
412 input.second->setPrecision(InferenceEngine::Precision::FP16);
415 for (auto &&output : network.getOutputsInfo()) {
416 output.second->setPrecision(InferenceEngine::Precision::FP16);
// Resolve the device plugin; FLAGS_pp is an extra plugin search path.
420 auto plugin = InferenceEngine::PluginDispatcher({FLAGS_pp}).getPluginByDevice(FLAGS_d);
422 /* If CPU device, load default library with extensions that comes with the product */
423 if (FLAGS_d.find("CPU") != std::string::npos) {
425 * cpu_extensions library is compiled from "extension" folder containing
426 * custom MKLDNNPlugin layer implementations. These layers are not supported
427 * by mkldnn, but they can be useful for inferencing custom topologies.
429 plugin.AddExtension(std::make_shared<InferenceEngine::Extensions::Cpu::CpuExtensions>());
// Optional user-supplied CPU extension shared library (-l).
432 if (!FLAGS_l.empty()) {
433 plugin.AddExtension(InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(FLAGS_l));
434 slog::info << "CPU Extension loaded: " << FLAGS_l << slog::endl;
437 if (!FLAGS_c.empty()) {
438 /* clDNN Extensions are loaded from an .xml description and OpenCL kernel files */
439 plugin.SetConfig({{InferenceEngine::PluginConfigParams::KEY_CONFIG_FILE, FLAGS_c}});
440 slog::info << "GPU Extension loaded: " << FLAGS_c << slog::endl;
// Load FLAGS_num_networks copies of the network; on multi-FPGA setups the
// copies are spread round-robin across devices via KEY_DEVICE_ID.
443 auto config = parseConfig(FLAGS_config);
444 std::vector<InferenceEngine::ExecutableNetwork> networks(FLAGS_num_networks);
445 for (std::size_t net = 0; net < networks.size(); ++net) {
446 slog::info << "Loading network " << net;
447 if (FLAGS_d.find("FPGA") != std::string::npos) {
448 if (FLAGS_num_fpga_devices != 1) {
449 config[InferenceEngine::PluginConfigParams::KEY_DEVICE_ID] = std::to_string(net % FLAGS_num_fpga_devices);
450 slog::info << " to device " << (net % FLAGS_num_fpga_devices);
453 slog::info << slog::endl;
455 networks[net] = plugin.LoadNetwork(network, config);
457 slog::info << "All networks are loaded" << slog::endl;
// 0 means "use the per-device default" from getNumberRequests().
459 auto num_requests = FLAGS_num_requests == 0 ? getNumberRequests(FLAGS_d) : FLAGS_num_requests;
// Validate that the inputs directory supplies files for every input kind
// the network actually has.
461 auto images = extractFilesByExtension(FLAGS_inputs_dir, "bmp");
462 auto hasImageInput = [](const InferenceEngine::CNNNetwork &net) {
463 auto inputs = net.getInputsInfo();
464 auto isImageInput = [](const InferenceEngine::InputsDataMap::value_type &input) {
465 return isImage(input.second);
467 return std::any_of(inputs.begin(), inputs.end(), isImageInput);
470 if (hasImageInput(network) && images.empty()) {
471 throw std::invalid_argument("The directory \"" + FLAGS_inputs_dir + "\" does not contain images for network");
474 auto binaries = extractFilesByExtension(FLAGS_inputs_dir, "bin");
475 auto hasBinaryInput = [](const InferenceEngine::CNNNetwork &net) {
476 auto inputs = net.getInputsInfo();
477 auto isBinaryInput = [](const InferenceEngine::InputsDataMap::value_type &input) {
478 return !isImage(input.second);
480 return std::any_of(inputs.begin(), inputs.end(), isBinaryInput);
483 if (hasBinaryInput(network) && binaries.empty()) {
484 throw std::invalid_argument("The directory \"" + FLAGS_inputs_dir + "\" does not contain binaries for network");
// Shared state for the async callbacks: `iteration` and `time_points`
// are guarded by `dump_time`; `num_finished` counts requests that hit the
// iteration limit; `done` releases the main thread when all are finished.
487 std::size_t iteration{0};
488 std::mutex dump_time;
489 std::atomic<std::size_t> num_finished{0};
491 std::promise<void> done;
492 num_requests *= FLAGS_num_networks;
// Pad the measured iterations with 2*num_requests warm-up and cool-down
// iterations on each side (excluded again inside printFPS).
493 std::size_t num_iterations = 2 * num_requests + FLAGS_num_iterations + 2 * num_requests;
495 std::vector<InferenceEngine::InferRequest> requests(num_requests);
496 std::vector<time_point> time_points(num_iterations);
498 using callback_t = std::function<void(InferenceEngine::InferRequest, InferenceEngine::StatusCode)>;
// Distribute requests round-robin across the loaded network copies.
500 for (std::size_t request = 0; request < num_requests; ++request) {
501 requests[request] = networks[request % networks.size()].CreateInferRequest();
503 loadInputs(request, images, binaries, requests[request], network);
// Completion callback: record the timestamp under the mutex, then
// resubmit the same request until num_iterations completions have been
// observed globally; the last request to finish fulfills `done`.
505 callback_t callback =
506 [num_requests, num_iterations, &iteration, &time_points, &dump_time, &num_finished, &done]
507 (InferenceEngine::InferRequest inferRequest, InferenceEngine::StatusCode code) {
508 if (code != InferenceEngine::StatusCode::OK) {
509 THROW_IE_EXCEPTION << "Infer request failed with code " << code;
512 std::size_t current_finished_iteration = 0;
514 std::lock_guard<std::mutex> lock(dump_time);
516 current_finished_iteration = iteration++;
517 if (current_finished_iteration < num_iterations) {
518 time_points[current_finished_iteration] = std::chrono::high_resolution_clock::now();
// Resubmit outside the lock; StartAsync may invoke the callback again.
522 if (current_finished_iteration < num_iterations - 1) {
523 inferRequest.StartAsync();
525 if (++num_finished == num_requests) {
531 requests[request].SetCompletionCallback<callback_t>(callback);
534 auto doneFuture = done.get_future();
// Kick off all requests; the callbacks keep the pipeline saturated.
536 for (auto &&request : requests) {
537 request.StartAsync();
// Summarize FPS over 10 time intervals once all requests have finished.
542 printFPS(num_requests, 10, time_points);
543 } catch (const std::exception &error) {
544 slog::err << error.what() << slog::endl;
547 slog::err << "Unknown/internal exception happened." << slog::endl;