// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <ie_plugin.hpp>
#include <description_buffer.hpp>

#include <ie_layouts.h>
#include <precision_utils.h>

#include <vpu/utils/perf_report.hpp>
#include <vpu/utils/ie_helpers.hpp>

#include "myriad_executable_network.h"
#include "myriad_infer_request.h"

using namespace vpu::MyriadPlugin;
using namespace InferenceEngine;
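
// Copies `bytes` elements from `src` to `dst`; it is only used with uint8_t pointers below,
// so it amounts to a byte-wise memcpy while staying type-checked.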
#define MEMCPY(dst, src, bytes) std::copy_n((src), (bytes), (dst))

MyriadInferRequest::MyriadInferRequest(GraphDesc &graphDesc,
                                       InferenceEngine::InputsDataMap networkInputs,
                                       InferenceEngine::OutputsDataMap networkOutputs,
                                       const DataInfo &inputInfo,
                                       const DataInfo &outputInfo,
                                       const std::vector<StageMetaInfo> &blobMetaData,
                                       const std::shared_ptr<MyriadConfig> &myriadConfig,
                                       const Logger::Ptr &log,
                                       const MyriadExecutorPtr &executor) :
        InferRequestInternal(networkInputs, networkOutputs), _executor(executor),
        _log(log), _stagesMetaData(blobMetaData), _config(myriadConfig),
        _inputInfo(inputInfo), _outputInfo(outputInfo),
        _graphDesc(graphDesc) {
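    // Choose the layout the device expects: NCHW when HW offloading is enabled, NHWC otherwise,
    // unless the configuration forces a specific layout.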
    _deviceLayout = _config->compileConfig.hwOptimization ? NCHW : NHWC;
    if (_config->compileConfig.forceLayout == ComputeLayout::NCHW)
        _deviceLayout = NCHW;
    if (_config->compileConfig.forceLayout == ComputeLayout::NHWC)
        _deviceLayout = NHWC;
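
    // Pre-allocate a host-side blob for every network input and validate that its precision is
    // supported, so the request always has valid buffers to read from.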
    for (auto &networkInput : _networkInputs) {
        // TODO: use TensorDesc instead of deprecated methods
        SizeVector dims = networkInput.second->getDims();
        Precision precision = networkInput.second->getInputPrecision();
        Layout layout = networkInput.second->getTensorDesc().getLayout();

        if (precision != Precision::FP32 &&
            precision != Precision::FP16 &&
            precision != Precision::U8) {
            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported input precision: "
                               << precision << "! Supported precisions: FP32, FP16 and U8";
        }

        Blob::Ptr inputBlob = make_blob_with_precision(precision, layout, dims);

        // allocate the input blob
        // TODO We are allocating temporary input buffer of enough size. Wrap this buffer in blobs
        inputBlob->allocate();
        _inputs[networkInput.first] = inputBlob;
    }
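
    // Likewise pre-allocate a host-side blob for every network output.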
    for (auto &networkOutput : _networkOutputs) {
        SizeVector dims = networkOutput.second->dims;
        Precision precision = networkOutput.second->getPrecision();
        Layout layout = networkOutput.second->layout;

        if (precision != Precision::FP32 &&
            precision != Precision::FP16) {
            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported output precision: "
                               << precision << "! Supported precisions: FP32, FP16";
        }

        Blob::Ptr outputBlob = make_blob_with_precision(precision, layout, dims);
        // allocate the output blob
        outputBlob->allocate();
        _outputs[networkOutput.first] = outputBlob;
    }
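
    // Staging buffers sized by the compiler-reported totals: inputBuffer packs multiple inputs
    // for upload, resultBuffer receives the raw device output before it is scattered back.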
    inputBuffer .resize(inputInfo.totalSize);
    resultBuffer.resize(outputInfo.totalSize);

    if (_networkOutputs.empty() || _networkInputs.empty()) {
        THROW_IE_EXCEPTION << "Internal error: no information about network's output/input";
    }
}

void MyriadInferRequest::InferImpl() {
    InferAsync();
    GetResult();
}

void MyriadInferRequest::InferAsync() {
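    // Check that every bound input/output blob still has a supported precision before dispatching
    // anything to the device.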
    for (auto input : _inputs) {
        auto const inputBlobPtr = input.second;
        if (inputBlobPtr->precision() != Precision::FP16
            && inputBlobPtr->precision() != Precision::FP32
            && inputBlobPtr->precision() != Precision::U8)
            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported input blob precision";
    }
    for (auto output : _outputs) {
        auto const outputBlobPtr = output.second;
        if (outputBlobPtr->precision() != Precision::FP16
            && outputBlobPtr->precision() != Precision::FP32)
            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported output blob precision";
    }

    // execute input pre-processing
    execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP

    Blob::Ptr tmpBlob;

    void* inputPtr = nullptr;
    size_t inputSize = _inputInfo.totalSize;
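
    // Two dispatch paths: with several inputs, pack them into the contiguous inputBuffer at the
    // offsets reported by the compiler; with a single input, point the device directly at its blob.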
    if (_inputs.size() > 1) {
        for (auto&& input : _inputs) {
            auto inputBlob = input.second;
            size_t byteSize = inputBlob->byteSize();
            Layout layout = inputBlob->getTensorDesc().getLayout();
            if (layout != _deviceLayout && (layout == NCHW || layout == NHWC)) {
                // TODO copyBlob allocates new memory, but we already have allocated buffer of enough size
                inputBlob = copyBlob(inputBlob, _deviceLayout);
            }

            const auto input_offset_it = _inputInfo.offset.find(input.first);
            if (input_offset_it != _inputInfo.offset.end()) {
                size_t required_buff_size = checked_cast<size_t>(input_offset_it->second) + byteSize;
                IE_ASSERT(required_buff_size <= inputBuffer.size());
                MEMCPY(&inputBuffer[input_offset_it->second], inputBlob->buffer().as<uint8_t*>(), byteSize);
            }
        }

        inputPtr = inputBuffer.data();
    } else {
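        // Single input: no packing needed, hand the device a pointer into the
        // (possibly re-laid-out) input blob.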
        auto dataName = _networkInputs.begin()->first;
        auto foundInputBlob = _inputs.find(dataName);
        if (foundInputBlob == _inputs.end())
            THROW_IE_EXCEPTION << "Error: input [" << dataName << "] is not provided.";

        tmpBlob = foundInputBlob->second;
        Layout layout = tmpBlob->getTensorDesc().getLayout();
        if (layout != _deviceLayout && (layout == NCHW || layout == NHWC)) {
            // TODO copyBlob allocates new memory, but we already have allocated buffer of enough size
            tmpBlob = copyBlob(tmpBlob, _deviceLayout);
        }

        inputPtr = tmpBlob->buffer();
    }
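
    // Send the input to the device and queue the inference; the output is fetched later in GetResult().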
    _executor->queueInference(_graphDesc, inputPtr, inputSize, nullptr, 0);
}

void MyriadInferRequest::GetResult() {
    _executor->getResult(_graphDesc, resultBuffer.data(), resultBuffer.size());
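
    // Scatter the raw device output back into the user-visible blobs: wrap each slice of
    // resultBuffer in a temporary blob with the device layout and let copyBlob() handle the
    // layout conversion.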
    for (auto pp : _outputs) {
        const auto offset_it = _outputInfo.offset.find(pp.first);

        if (offset_it != _outputInfo.offset.end()) {
            size_t resultOffset = checked_cast<size_t>(offset_it->second);
            if (resultOffset > resultBuffer.size()) {
                THROW_IE_EXCEPTION << "unexpected result data size";
            }

            auto outputBlob = pp.second;
            auto outDesc = outputBlob->getTensorDesc();

            // TODO: TensorDesc doesn't update internal BlockingDesc and strides when setLayout is called
            auto vpuLayout = (outDesc.getLayout() == NCHW || outDesc.getLayout() == NHWC) ? _deviceLayout : outDesc.getLayout();
            ie::TensorDesc tempTensorDesc(outDesc.getPrecision(), outDesc.getDims(), vpuLayout);
            auto tmpBlob = make_blob_with_precision(tempTensorDesc, resultBuffer.data() + resultOffset);

            copyBlob(tmpBlob, outputBlob);
        }
    }
}

void MyriadInferRequest::GetPerformanceCounts(std::map<std::string, InferenceEngineProfileInfo> &perfMap) const {
    auto perfInfo = _executor->getPerfTimeInfo(_graphDesc._graphHandle);
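
    // perfInfo carries per-stage timings from the device; its last entry is logged below as the
    // overall device execution time, and the data is parsed into the per-layer perfMap report.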
    if (_log->level() >= LogLevel::Info) {
        if (!perfInfo.empty()) {
            _log->info("** Device execution time %f **", perfInfo[perfInfo.size() - 1]);
        }
    }

    perfMap = vpu::parsePerformanceReport(
        _stagesMetaData,
        perfInfo.data(), perfInfo.size(),
        _config->perfReport, _config->printReceiveTensorTime);
}