// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#define NOMINMAX
#include <algorithm>  // std::copy_n, used by the MEMCPY macro below

#include <ie_blob.h>
#include <ie_plugin.hpp>
#include <description_buffer.hpp>
#include <debug.h>
#include <ie_layouts.h>
#include <precision_utils.h>

#include <vpu/utils/perf_report.hpp>
#include <vpu/utils/ie_helpers.hpp>

#include "myriad_executable_network.h"
#include "myriad_infer_request.h"

using namespace vpu;
using namespace vpu::MyriadPlugin;
using namespace InferenceEngine;

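// Byte-wise copy helper: with uint8_t* pointers and a byte count,
// std::copy_n behaves like memcpy (hence the <algorithm> include above).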
#define MEMCPY(dst, src, bytes) std::copy_n((src), (bytes), (dst))

MyriadInferRequest::MyriadInferRequest(GraphDesc &graphDesc,
                                       InferenceEngine::InputsDataMap networkInputs,
                                       InferenceEngine::OutputsDataMap networkOutputs,
                                       DataInfo& inputInfo,
                                       DataInfo& outputInfo,
                                       const std::vector<StageMetaInfo> &blobMetaData,
                                       const std::shared_ptr<MyriadConfig> &myriadConfig,
                                       const Logger::Ptr &log,
                                       const MyriadExecutorPtr &executor) :
        InferRequestInternal(networkInputs, networkOutputs), _executor(executor),
        _log(log), _stagesMetaData(blobMetaData), _config(myriadConfig),
        _inputInfo(inputInfo), _outputInfo(outputInfo),
        _graphDesc(graphDesc) {
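    // Pick the layout the device expects: the HW-accelerated pipeline works in
    // NCHW, the SW path in NHWC; an explicit forceLayout setting overrides both.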
    _deviceLayout = _config->compileConfig.hwOptimization ? NCHW : NHWC;
    if (_config->compileConfig.forceLayout == ComputeLayout::NCHW)
        _deviceLayout = NCHW;
    if (_config->compileConfig.forceLayout == ComputeLayout::NHWC)
        _deviceLayout = NHWC;
    // allocate inputs
    for (auto &networkInput : _networkInputs) {
        // TODO: use TensorDesc instead of deprecated methods
        SizeVector dims      = networkInput.second->getDims();
        Precision  precision = networkInput.second->getInputPrecision();
        Layout     layout    = networkInput.second->getTensorDesc().getLayout();

        if (precision != Precision::FP32 &&
            precision != Precision::FP16 &&
            precision != Precision::U8) {
            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported input precision: "
                               << precision << "! Supported precisions: FP32, FP16 and U8";
        }
        Blob::Ptr inputBlob = make_blob_with_precision(precision, layout, dims);

        // allocate the input blob
        // TODO We are allocating temporary input buffer of enough size. Wrap this buffer in blobs
        inputBlob->allocate();
        _inputs[networkInput.first] = inputBlob;
    }
    // allocate outputs
    for (auto &networkOutput : _networkOutputs) {
        SizeVector dims      = networkOutput.second->dims;
        Precision  precision = networkOutput.second->getPrecision();
        Layout     layout    = networkOutput.second->layout;

        if (precision != Precision::FP32 &&
            precision != Precision::FP16) {
            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported output precision: "
                               << precision << "! Supported precisions: FP32, FP16";
        }
        Blob::Ptr outputBlob = make_blob_with_precision(precision, layout, dims);
        // allocate the output blob
        outputBlob->allocate();
        _outputs[networkOutput.first] = outputBlob;
    }

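    // All network inputs are packed into one contiguous staging buffer and all
    // outputs arrive in one contiguous result buffer; per-blob positions come
    // from the offset maps in _inputInfo/_outputInfo.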
    inputBuffer .resize(inputInfo.totalSize);
    resultBuffer.resize(outputInfo.totalSize);

    if (_networkOutputs.empty() || _networkInputs.empty()) {
        THROW_IE_EXCEPTION << "Internal error: no information about network's inputs/outputs";
    }
}

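// Synchronous inference: submit the request to the device, then block until
// the result has been read back.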
void MyriadInferRequest::InferImpl() {
    InferAsync();
    GetResult();
}

void MyriadInferRequest::InferAsync() {
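    // Re-validate blob precisions here: the user may have replaced the default
    // blobs via SetBlob() since the request was constructed.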
    for (const auto& input : _inputs) {
        auto const inputBlobPtr = input.second;
        if (inputBlobPtr->precision() != Precision::FP16
            && inputBlobPtr->precision() != Precision::FP32
            && inputBlobPtr->precision() != Precision::U8)
            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported input blob precision";
    }
    for (const auto& output : _outputs) {
        auto const outputBlobPtr = output.second;
        if (outputBlobPtr->precision() != Precision::FP16
            && outputBlobPtr->precision() != Precision::FP32)
            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported output blob precision";
    }

    // execute input pre-processing
    execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP

    Blob::Ptr tmpBlob;

    void* inputPtr = nullptr;
    size_t inputSize = _inputInfo.totalSize;

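    // Two submission paths: with several inputs, each blob is copied into the
    // contiguous staging buffer at its precomputed offset; with a single input,
    // the blob's own memory is handed to the device directly (after an optional
    // layout conversion), skipping the extra copy.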
    if (_inputs.size() > 1) {
        for (auto&& input : _inputs) {
            auto inputBlob = input.second;
            size_t byteSize = inputBlob->byteSize();
            Layout layout = inputBlob->getTensorDesc().getLayout();
            if (layout != _deviceLayout && (layout == NCHW || layout == NHWC)) {
                // TODO copyBlob allocates new memory, but we already have allocated buffer of enough size
                inputBlob = copyBlob(inputBlob, _deviceLayout);
            }

            const auto input_offset_it = _inputInfo.offset.find(input.first);
            if (input_offset_it != _inputInfo.offset.end()) {
                size_t required_buff_size = checked_cast<size_t>(input_offset_it->second) + byteSize;
                IE_ASSERT(required_buff_size <= inputBuffer.size());
                MEMCPY(&inputBuffer[input_offset_it->second], inputBlob->buffer().as<uint8_t*>(), byteSize);
            }
        }

        inputPtr = inputBuffer.data();
    } else {
        auto dataName = _networkInputs.begin()->first;
        auto foundInputBlob = _inputs.find(dataName);
        if (foundInputBlob == _inputs.end())
            THROW_IE_EXCEPTION << "Error: input [" << dataName << "] is not provided.";

        tmpBlob = foundInputBlob->second;
        Layout layout = tmpBlob->getTensorDesc().getLayout();
        if (layout != _deviceLayout && (layout == NCHW || layout == NHWC)) {
            // TODO copyBlob allocates new memory, but we already have allocated buffer of enough size
            tmpBlob = copyBlob(tmpBlob, _deviceLayout);
        }

        inputPtr = tmpBlob->buffer();
    }

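    // Outputs are fetched separately in GetResult(), so no result buffer is
    // passed at submission time.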
    _executor->queueInference(_graphDesc, inputPtr, inputSize, nullptr, 0);
}

void MyriadInferRequest::GetResult() {
    _executor->getResult(_graphDesc, resultBuffer.data(), resultBuffer.size());

    for (const auto& pp : _outputs) {
        const auto offset_it = _outputInfo.offset.find(pp.first);

        if (offset_it != _outputInfo.offset.end()) {
            size_t resultOffset = checked_cast<size_t>(offset_it->second);
            if (resultOffset > resultBuffer.size()) {
                THROW_IE_EXCEPTION << "unexpected result data size";
            }

            auto outputBlob = pp.second;
            auto outDesc = outputBlob->getTensorDesc();
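            // Wrap the corresponding slice of the device result buffer in a
            // temporary blob described with the device-side layout, then let
            // copyBlob() convert it into the user-visible output blob.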
            // TODO: TensorDesc doesn't update internal BlockingDesc and strides when setLayout is called
            auto vpuLayout = (outDesc.getLayout() == NCHW || outDesc.getLayout() == NHWC) ? _deviceLayout : outDesc.getLayout();
            ie::TensorDesc tempTensorDesc(outDesc.getPrecision(), outDesc.getDims(), vpuLayout);
            auto tmpBlob = make_blob_with_precision(tempTensorDesc, resultBuffer.data() + resultOffset);

            copyBlob(tmpBlob, outputBlob);
        }
    }
}

void MyriadInferRequest::GetPerformanceCounts(std::map<std::string, InferenceEngineProfileInfo> &perfMap) const {
    auto perfInfo = _executor->getPerfTimeInfo(_graphDesc._graphHandle);

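    // The last entry of the timing array holds the overall device execution
    // time; the per-stage entries are decoded by parsePerformanceReport() below.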
    if (_log->level() >= LogLevel::Info && !perfInfo.empty()) {
        _log->info("** Device execution time %f **", perfInfo[perfInfo.size() - 1]);
    }

    perfMap = vpu::parsePerformanceReport(
        _stagesMetaData,
        perfInfo.data(), perfInfo.size(),
        _config->perfReport, _config->printReceiveTensorTime);
}