// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#define NOMINMAX
#include <algorithm>  // std::copy_n, used by the MEMCPY macro below

#include <ie_blob.h>
#include <ie_plugin.hpp>
#include <description_buffer.hpp>
#include <debug.h>
#include <ie_layouts.h>
#include <precision_utils.h>

#include <vpu/utils/perf_report.hpp>
#include <vpu/utils/ie_helpers.hpp>

#include "myriad_executable_network.h"
#include "myriad_infer_request.h"

using namespace vpu;
using namespace vpu::MyriadPlugin;
using namespace InferenceEngine;

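// Byte-wise copy helper: with uint8_t* pointers and a byte count,
// std::copy_n behaves like memcpy (hence the <algorithm> include above).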
#define MEMCPY(dst, src, bytes) std::copy_n((src), (bytes), (dst))

MyriadInferRequest::MyriadInferRequest(GraphDesc &graphDesc,
                                       InferenceEngine::InputsDataMap networkInputs,
                                       InferenceEngine::OutputsDataMap networkOutputs,
                                       DataInfo& inputInfo,
                                       DataInfo& outputInfo,
                                       const std::vector<StageMetaInfo> &blobMetaData,
                                       const std::shared_ptr<MyriadConfig> &myriadConfig,
                                       const Logger::Ptr &log,
                                       const MyriadExecutorPtr &executor) :
        InferRequestInternal(networkInputs, networkOutputs), _executor(executor),
        _log(log), _stagesMetaData(blobMetaData), _config(myriadConfig),
        _inputInfo(inputInfo), _outputInfo(outputInfo),
        _graphDesc(graphDesc) {
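    // Pick the layout the device expects: the HW-accelerated pipeline works in
    // NCHW, the SW path in NHWC; an explicit forceLayout setting overrides both.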
    _deviceLayout = _config->compileConfig.hwOptimization ? NCHW : NHWC;
    if (_config->compileConfig.forceLayout == ComputeLayout::NCHW)
        _deviceLayout = NCHW;
    if (_config->compileConfig.forceLayout == ComputeLayout::NHWC)
        _deviceLayout = NHWC;
    // allocate inputs
    for (auto &networkInput : _networkInputs) {
        // TODO: use TensorDesc instead of deprecated methods
        SizeVector dims      = networkInput.second->getDims();
        Precision  precision = networkInput.second->getInputPrecision();
        Layout     layout    = networkInput.second->getTensorDesc().getLayout();

        if (precision != Precision::FP32 &&
            precision != Precision::FP16 &&
            precision != Precision::U8) {
            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported input precision: "
                               << precision << "! Supported precisions: FP32, FP16 and U8";
        }
        Blob::Ptr inputBlob = make_blob_with_precision(precision, layout, dims);

        // allocate the input blob
        // TODO We are allocating temporary input buffer of enough size. Wrap this buffer in blobs
        inputBlob->allocate();
        _inputs[networkInput.first] = inputBlob;
    }
    // allocate outputs
    for (auto &networkOutput : _networkOutputs) {
        SizeVector dims      = networkOutput.second->dims;
        Precision  precision = networkOutput.second->getPrecision();
        Layout     layout    = networkOutput.second->layout;

        if (precision != Precision::FP32 &&
            precision != Precision::FP16) {
            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported output precision: "
                               << precision << "! Supported precisions: FP32, FP16";
        }
        Blob::Ptr outputBlob = make_blob_with_precision(precision, layout, dims);
        // allocate the output blob
        outputBlob->allocate();
        _outputs[networkOutput.first] = outputBlob;
    }

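    // All network inputs are packed into one contiguous staging buffer and all
    // outputs arrive in one contiguous result buffer; per-blob positions come
    // from the offset maps in _inputInfo/_outputInfo.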
    inputBuffer .resize(inputInfo.totalSize);
    resultBuffer.resize(outputInfo.totalSize);

    if (_networkOutputs.empty() || _networkInputs.empty()) {
        THROW_IE_EXCEPTION << "Internal error: no information about network's inputs/outputs";
    }
}

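// Synchronous inference: submit the request to the device, then block until
// the result has been read back.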
void MyriadInferRequest::InferImpl() {
    InferAsync();
    GetResult();
}

void MyriadInferRequest::InferAsync() {
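    // Re-validate blob precisions here: the user may have replaced the default
    // blobs via SetBlob() since the request was constructed.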
    for (const auto& input : _inputs) {
        auto const inputBlobPtr = input.second;
        if (inputBlobPtr->precision() != Precision::FP16
            && inputBlobPtr->precision() != Precision::FP32
            && inputBlobPtr->precision() != Precision::U8)
            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported input blob precision";
    }
    for (const auto& output : _outputs) {
        auto const outputBlobPtr = output.second;
        if (outputBlobPtr->precision() != Precision::FP16
            && outputBlobPtr->precision() != Precision::FP32)
            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported output blob precision";
    }

    // execute input pre-processing
    execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP

    Blob::Ptr tmpBlob;

    void* inputPtr = nullptr;
    size_t inputSize = _inputInfo.totalSize;

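    // Two submission paths: with several inputs, each blob is copied into the
    // contiguous staging buffer at its precomputed offset; with a single input,
    // the blob's own memory is handed to the device directly (after an optional
    // layout conversion), skipping the extra copy.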
    if (_inputs.size() > 1) {
        for (auto&& input : _inputs) {
            auto inputBlob = input.second;
            size_t byteSize = inputBlob->byteSize();
            Layout layout = inputBlob->getTensorDesc().getLayout();
            if (layout != _deviceLayout && (layout == NCHW || layout == NHWC)) {
                // TODO copyBlob allocates new memory, but we already have allocated buffer of enough size
                inputBlob = copyBlob(inputBlob, _deviceLayout);
            }

            const auto input_offset_it = _inputInfo.offset.find(input.first);
            if (input_offset_it != _inputInfo.offset.end()) {
                size_t required_buff_size = checked_cast<size_t>(input_offset_it->second) + byteSize;
                IE_ASSERT(required_buff_size <= inputBuffer.size());
                MEMCPY(&inputBuffer[input_offset_it->second], inputBlob->buffer().as<uint8_t*>(), byteSize);
            }
        }

        inputPtr = inputBuffer.data();
    } else {
        auto dataName = _networkInputs.begin()->first;
        auto foundInputBlob = _inputs.find(dataName);
        if (foundInputBlob == _inputs.end())
            THROW_IE_EXCEPTION << "Error: input [" << dataName << "] is not provided.";

        tmpBlob = foundInputBlob->second;
        Layout layout = tmpBlob->getTensorDesc().getLayout();
        if (layout != _deviceLayout && (layout == NCHW || layout == NHWC)) {
            // TODO copyBlob allocates new memory, but we already have allocated buffer of enough size
            tmpBlob = copyBlob(tmpBlob, _deviceLayout);
        }

        inputPtr = tmpBlob->buffer();
    }

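    // Outputs are fetched separately in GetResult(), so no result buffer is
    // passed at submission time.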
    _executor->queueInference(_graphDesc, inputPtr, inputSize, nullptr, 0);
}

void MyriadInferRequest::GetResult() {
    _executor->getResult(_graphDesc, resultBuffer.data(), resultBuffer.size());

    for (const auto& pp : _outputs) {
        const auto offset_it = _outputInfo.offset.find(pp.first);

        if (offset_it != _outputInfo.offset.end()) {
            size_t resultOffset = checked_cast<size_t>(offset_it->second);
            if (resultOffset > resultBuffer.size()) {
                THROW_IE_EXCEPTION << "unexpected result data size";
            }

            auto outputBlob = pp.second;
            auto outDesc = outputBlob->getTensorDesc();
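            // Wrap the corresponding slice of the device result buffer in a
            // temporary blob described with the device-side layout, then let
            // copyBlob() convert it into the user-visible output blob.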
            // TODO: TensorDesc doesn't update internal BlockingDesc and strides when setLayout is called
            auto vpuLayout = (outDesc.getLayout() == NCHW || outDesc.getLayout() == NHWC) ? _deviceLayout : outDesc.getLayout();
            ie::TensorDesc tempTensorDesc(outDesc.getPrecision(), outDesc.getDims(), vpuLayout);
            auto tmpBlob = make_blob_with_precision(tempTensorDesc, resultBuffer.data() + resultOffset);

            copyBlob(tmpBlob, outputBlob);
        }
    }
}

void MyriadInferRequest::GetPerformanceCounts(std::map<std::string, InferenceEngineProfileInfo> &perfMap) const {
    auto perfInfo = _executor->getPerfTimeInfo(_graphDesc._graphHandle);

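    // The last entry of the timing array holds the overall device execution
    // time; the per-stage entries are decoded by parsePerformanceReport() below.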
    if (_log->level() >= LogLevel::Info && !perfInfo.empty()) {
        _log->info("** Device execution time %f **", perfInfo[perfInfo.size() - 1]);
    }

    perfMap = vpu::parsePerformanceReport(
        _stagesMetaData,
        perfInfo.data(), perfInfo.size(),
        _config->perfReport, _config->printReceiveTensorTime);
}