1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
9 #include <CPP/detection_output.hpp> // todo: find a way to remove this
10 #include <description_buffer.hpp>
11 #include "cldnn_infer_request.h"
13 using namespace InferenceEngine;
15 namespace CLDNNPlugin {
17 const char CLDNNInferRequest::fp32_suffix[] = "_fp32";
// Creates an InferenceEngine blob for a network input described by 'desc'.
// When 'mem_ptr' is non-null the blob wraps that externally owned buffer
// (zero-copy over clDNN-allocated memory); otherwise an unallocated blob is
// returned and the caller is expected to allocate() it.
// NOTE(review): the precision switch/case scaffolding appears elided in this
// view of the file; the if/return pairs below correspond to the FP32 / FP16 /
// I16 / U8 cases respectively — confirm against the full source.
Blob::Ptr CLDNNInferRequest::createInputBlob(const TensorDesc& desc, uint8_t* mem_ptr) {
const Layout l = desc.getLayout();
const Precision p = desc.getPrecision();
// Dims are reversed (rbegin/rend) relative to the TensorDesc order.
const SizeVector sz = SizeVector(desc.getDims().rbegin(), desc.getDims().rend());
// FP32: wrap the provided buffer, or build an unallocated float blob.
if (mem_ptr != nullptr)
return make_shared_blob<float>(p, l, sz, reinterpret_cast<float*>(mem_ptr));
return make_shared_blob<float, const SizeVector>(p, l, sz);
// FP16: half-precision payload is carried as raw uint16_t words.
if (mem_ptr != nullptr)
return make_shared_blob<uint16_t>(p, l, sz, reinterpret_cast<uint16_t*>(mem_ptr));
return make_shared_blob<uint16_t, const SizeVector>(p, l, sz);
// I16: signed 16-bit input (converted to fp32 later, see PrepareInput).
if (mem_ptr != nullptr)
return make_shared_blob<int16_t>(p, l, sz, reinterpret_cast<int16_t*>(mem_ptr));
return make_shared_blob<int16_t, const SizeVector>(p, l, sz);
// U8: note the unallocated path passes Precision::U8 explicitly rather
// than 'p', unlike the other branches.
if (mem_ptr != nullptr)
return make_shared_blob<uint8_t>(p, l, sz, reinterpret_cast<uint8_t*>(mem_ptr));
return make_shared_blob<uint8_t, const SizeVector>(Precision::U8, l, sz);
// Any other input precision is rejected.
THROW_IE_EXCEPTION << "The plugin does not support input " << p.name() << " precision";
// Creates an InferenceEngine blob for a network output described by 'desc'.
// When 'mem_ptr' is non-null the blob wraps that externally owned buffer;
// otherwise an unallocated blob is returned for the caller to allocate().
// NOTE(review): the precision switch/case scaffolding appears elided in this
// view; the two if/return pairs correspond to the FP32 and FP16 cases.
// Only FP32 and FP16 outputs are supported here (narrower than inputs).
Blob::Ptr CLDNNInferRequest::createOutputBlob(const TensorDesc& desc, uint8_t* mem_ptr) {
const Layout l = desc.getLayout();
const Precision p = desc.getPrecision();
// Dims are reversed (rbegin/rend) relative to the TensorDesc order.
const SizeVector sz = SizeVector(desc.getDims().rbegin(), desc.getDims().rend());
// FP32 output.
if (mem_ptr != nullptr)
return make_shared_blob<float>(p, l, sz, reinterpret_cast<float*>(mem_ptr));
return make_shared_blob<float, const SizeVector>(p, l, sz);
// FP16 output, carried as raw uint16_t words.
if (mem_ptr != nullptr)
return make_shared_blob<uint16_t>(p, l, sz, reinterpret_cast<uint16_t*>(mem_ptr));
return make_shared_blob<uint16_t, const SizeVector>(p, l, sz);
THROW_IE_EXCEPTION << "The plugin does not support output " << p.name() << " precision";
// Copies output data from clDNN device memory into the user-visible blob
// 'bptr', stripping any spatial padding that clDNN added to the layout.
// 'bi' (buf_info) is optional: when non-null it gives the element count and
// offset for one sub-batch slice (dynamic-batch mode); when null the whole
// blob is copied.
// NOTE(review): the remaining parameters of the signature (bptr, bi) and the
// switch/case scaffolding appear elided in this view — confirm against the
// full source.
void CLDNNInferRequest::copyOutputData(const cldnn::memory& outputMemory,
size_t n = (bi == nullptr) ? bptr->size() : bi->buf_size;
size_t offset = (bi == nullptr) ? 0 : bi->buf_offset;
auto layout = outputMemory.get_layout();
auto size = layout.size;
// Lower/upper padding per dimension, as recorded in the clDNN layout.
auto l_padd = layout.data_padding.lower_size();
auto u_padd = layout.data_padding.upper_size();
// Total horizontal padding per row, and the padded row counts above/below
// the data region (rows are (h_padding + width) elements wide).
auto h_padding = u_padd.spatial[0] + l_padd.spatial[0];
auto v_padding_l = (h_padding + size.spatial[0]) * u_padd.spatial[1];
auto v_padding_u = (h_padding + size.spatial[0]) * l_padd.spatial[1];
switch (bptr->precision()) {
case Precision::FP32: {
TBlob<float>::Ptr out_f = std::dynamic_pointer_cast<TBlob<float>>(bptr);
if (out_f == nullptr) {
THROW_IE_EXCEPTION << "Invalid output blob";
auto resPtr = outputMemory.pointer<float>();
float *resVec = out_f->data() + offset;
// Padded layout: walk b/f/y/x, skipping the per-row lower/upper pads.
// NOTE(review): the declaration of the running source index 'i' (and the
// per-feature vertical-padding skips) appear on elided lines.
if (h_padding || v_padding_l || v_padding_u) {
for (size_t b = 0; b < size.batch[0]; b++) {
for (size_t f = 0; f < size.feature[0]; f++) {
for (size_t y = 0; y < size.spatial[1]; y++) {
i += l_padd.spatial[0];
for (size_t x = 0; x < size.spatial[0]; x++, i++) {
*resVec++ = resPtr[i];
i += u_padd.spatial[0];
// Unpadded layout: plain element-by-element copy of n values.
for (size_t i = 0; i < n; i++) {
resVec[i] = resPtr[i];
case Precision::FP16: {
// Same copy logic as FP32, but fp16 payload moved as raw uint16_t.
TBlob<uint16_t>::Ptr out_f = std::dynamic_pointer_cast<TBlob<uint16_t>>(bptr);
if (out_f == nullptr) {
THROW_IE_EXCEPTION << "Invalid output blob";
auto resPtr = outputMemory.pointer<uint16_t>();
uint16_t *resVec = out_f->data() + offset;
if (h_padding || v_padding_l || v_padding_u) {
for (size_t b = 0; b < size.batch[0]; b++) {
for (size_t f = 0; f < size.feature[0]; f++) {
for (size_t y = 0; y < size.spatial[1]; y++) {
i += l_padd.spatial[0];
for (size_t x = 0; x < size.spatial[0]; x++, i++) {
*resVec++ = resPtr[i];
i += u_padd.spatial[0];
for (size_t i = 0; i < n; i++) {
resVec[i] = resPtr[i];
THROW_IE_EXCEPTION << "The plugin does not support output " << bptr->precision() << " precision";
// Feeds user input data into a clDNN network by attaching the blob's buffer
// (zero-copy) as the network's input memory. 'bi' is optional dynamic-batch
// slice info: when non-null, 'n'/'offset' select one sub-batch of the blob.
// clDNN then copies from the attached buffer during set_input_data.
// NOTE(review): case terminators/breaks appear elided in this view.
void CLDNNInferRequest::copyInputData(std::shared_ptr<cldnn::network> network,
const cldnn::primitive_id &inputName,
const cldnn::layout& inputLayout,
const Blob &inputBlob, buf_info* bi) {
size_t n = (bi == nullptr) ? inputBlob.size() : bi->buf_size;
size_t offset = (bi == nullptr) ? 0 : bi->buf_offset;
// clDNN input primitives are registered under the "Input:" prefix.
cldnn::primitive_id internalName = "Input:" + inputName;
switch (inputBlob.precision()) {
case Precision::FP32: {
// const_cast is needed because memory::attach takes a mutable pointer;
// the blob's buffer is treated as the backing store for the input.
float* blob_ptr = const_cast<float*>(inputBlob.cbuffer().as<const float*>()) + offset;
network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n));
case Precision::FP16: {
uint16_t* blob_ptr = const_cast<uint16_t*>(inputBlob.cbuffer().as<const uint16_t*>()) + offset;
network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n));
case Precision::U8: {
uint8_t* blob_ptr = const_cast<uint8_t*>(inputBlob.cbuffer().as<const uint8_t*>()) + offset;
network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n));
THROW_IE_EXCEPTION << "The plugin does not support input " << inputBlob.precision() << " precision";
// Allocates device memory for every network input (static-batch path) and
// wraps each allocation in an IE blob stored in _inputs, so user writes go
// straight into clDNN-owned memory.
void CLDNNInferRequest::AllocateInputs() {
for (auto &input : m_env.inputLayouts) {
std::string name = input.first;
cldnn::layout layout = input.second;
InputInfo::Ptr ni = _networkInputs.at(input.first);
const TensorDesc& desc = ni->getTensorDesc();
// Allocate on the clDNN engine and expose the raw pointer to the blob.
cldnn::memory inputMem = cldnn::memory::allocate(*(m_env.engine), layout);
cldnn::pointer<uint8_t> mem_ptr = inputMem.pointer<uint8_t>();
inputsMemory.insert({ name, inputMem });
_inputs[name] = createInputBlob(desc, mem_ptr.data());
// I16 inputs need a parallel fp32 buffer: clDNN has no I16 input support,
// so PrepareInput converts into this "<name>_fp32" staging memory.
if (desc.getPrecision() == Precision::I16) {
cldnn::layout layout_fp32 = layout;
layout_fp32.data_type = cldnn::data_types::f32;
cldnn::memory inputMem_fp32 = cldnn::memory::allocate(*(m_env.engine), layout_fp32);
inputsMemory.insert({ input.first + fp32_suffix, inputMem_fp32 });
// Allocates host-side input blobs for the dynamic-batch path. Blobs are
// sized for the maximum batch (m_max_batch); copyInputData later attaches
// the relevant slice per sub-network.
// NOTE(review): the empty-dims check branch scaffolding appears elided in
// this view.
void CLDNNInferRequest::AllocateInputsDyn() {
for (auto &input : m_env.inputLayouts) {
InputInfo::Ptr ni = _networkInputs.at(input.first);
TensorDesc desc = ni->getTensorDesc();
SizeVector& dims = desc.getDims();
// Force the batch dimension (first dim) to the configured maximum.
*dims.begin() = static_cast<size_t>(m_env.m_max_batch);
THROW_IE_EXCEPTION << "Empty dimensions for input blob " << input.first;
Blob::Ptr inputBlob = createInputBlob(desc);
// I16 gets an extra fp32 staging blob (see fp32_suffix) since clDNN
// cannot consume I16 directly.
if (desc.getPrecision() == Precision::I16) {
auto fp32inputBlob = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(Precision::FP32,
fp32inputBlob->allocate();
_inputs[input.first + fp32_suffix] = fp32inputBlob;
inputBlob->allocate();
_inputs[input.first] = inputBlob;
// Binds every IE network output to the corresponding clDNN output primitive
// and wraps its device memory in an IE blob (zero-copy, static-batch path).
// Also fills outputsMap (IE output name -> clDNN primitive id).
void CLDNNInferRequest::AllocateOutputs() {
auto networkOutputsIDs = m_env.network->get_output_ids();
auto allPrimitiveIds = m_env.network->get_all_primitives();
for (auto& no : _networkOutputs) {
// Find correct output ID. Start with name stored in IR.
std::string outputID = m_env.primitiveIDs.at(no.first);
while (std::find(networkOutputsIDs.begin(), networkOutputsIDs.end(), outputID) == networkOutputsIDs.end()) {
// If current ID isn't found in cldnn network outputs, get previous primitive id and try again.
auto prim = allPrimitiveIds.find(outputID);
if (prim == allPrimitiveIds.end()) {
THROW_IE_EXCEPTION << "Unknown primitive id " << outputID;
// Only walk up through primitives that clDNN optimized out and that
// have exactly one parent; anything else is an error.
if (m_env.prevPrimitiveIDs.at(outputID).size() != 1 || prim->second != "_optimized_") {
THROW_IE_EXCEPTION << "Unable to find parent for output primitive " << outputID;
outputID = m_env.prevPrimitiveIDs.at(outputID)[0];
// Wrap the output's device memory directly in the user-visible blob.
cldnn::memory output_mem = m_env.network->get_output_memory(outputID);
cldnn::pointer<uint8_t> output_mem_ptr = output_mem.pointer<uint8_t>();
if (output_mem_ptr.data() == nullptr) {
THROW_IE_EXCEPTION << "Empty output memory for primitive " << outputID;
DataPtr oi = no.second;
const TensorDesc& desc = oi->getTensorDesc();
_outputs[no.first] = createOutputBlob(desc, output_mem_ptr.data());
outputsMap[no.first] = outputID;
// Allocates host-side output blobs for the dynamic-batch path, sized for the
// maximum batch; execAndParseDyn later copies each sub-batch slice into them.
// NOTE(review): the empty-dims check branch scaffolding appears elided in
// this view.
void CLDNNInferRequest::AllocateOutputsDyn() {
for (auto& no : _networkOutputs) {
DataPtr oi = no.second;
TensorDesc desc = oi->getTensorDesc();
SizeVector& dims = desc.getDims();
// Force the batch dimension (first dim) to the configured maximum.
*dims.begin() = static_cast<size_t>(m_env.m_max_batch);
THROW_IE_EXCEPTION << "Empty dimensions for output blob " << no.first;
Blob::Ptr outputBlob = createOutputBlob(desc);
outputBlob->allocate();
_outputs[no.first] = outputBlob;
// Sets the effective batch size for a dynamic-batch request. The batch is
// decomposed bitwise (mask = 1 << nb) across the pre-compiled power-of-two
// sub-networks; this function precomputes per-sub-network buffer slices
// (offset + size) for every input and output into batchInputs/batchOutputs.
// NOTE(review): the statements that advance 'offset'/'bsz' per iteration
// (presumably offset += bsz when the bit is set, bsz doubled each step) and
// the batchInputs.clear() call appear on elided lines — confirm against the
// full source.
void CLDNNInferRequest::SetBatch(int new_batch) {
if (m_env.m_max_batch < 0)
THROW_IE_EXCEPTION << "Dynamic batch is not enabled.";
if (new_batch < 1 || new_batch > m_env.m_max_batch) {
THROW_IE_EXCEPTION << "Invalid dynamic batch size " << new_batch <<
" for this request.";
// No work if the requested batch is already current.
if (new_batch == m_curBatch)
batchOutputs.clear();
// tune expected inputs
for (auto &input : m_env.inputLayouts) {
cldnn::tensor dims = input.second.size;
// Per-sample element count: x * y * features * 1.
const SizeVector sz = { size_t(dims.spatial[0]), size_t(dims.spatial[1]), size_t(dims.feature[0]), 1 };
size_t single_batch = std::accumulate(std::begin(sz), std::end(sz), (size_t)1, std::multiplies<size_t>());
std::vector<buf_info> in_buf;
size_t bsz = single_batch;
// calculate metadata for input buffers
for (unsigned nb = 0; nb < m_env.m_bv_sz; nb++) {
unsigned int mask = 1 << nb;
buf_info ib = { offset, bsz };
in_buf.push_back(ib);
// Only sub-networks whose bit is set in new_batch participate.
if (new_batch & mask)
batchInputs[input.first] = in_buf;
// tune expected outputs
for (auto& no : _networkOutputs) {
// Output dims may be keyed by IR name or by mapped primitive id.
auto res_output = m_env.outputDims.find(no.first);
InferenceEngine::SizeVector sz;
if (res_output != m_env.outputDims.end())
sz = res_output->second;
sz = m_env.outputDims.at(m_env.primitiveIDs.at(no.first));
size_t single_batch = std::accumulate(std::begin(sz), std::end(sz), (size_t)1, std::multiplies<size_t>());
std::vector<buf_info> out_buf;
size_t bsz = single_batch;
// calculate metadata for output buffers
for (unsigned nb = 0; nb < m_env.m_bv_sz; nb++) {
unsigned int mask = 1 << nb;
buf_info ob = { offset, bsz };
out_buf.push_back(ob);
if (new_batch & mask)
batchOutputs[no.first] = out_buf;
m_curBatch = new_batch;
// Constructor: allocates input/output blobs (dynamic-batch variants when
// m_max_batch > 1, static otherwise — the static branch appears on elided
// lines) and, when profiling is enabled, pre-parses each profiled
// primitive's info string to cache its kernel implementation name.
// NOTE(review): the member-init of m_env and the AllocateInputs[...Dyn]()
// calls appear elided in this view — confirm against the full source.
CLDNNInferRequest::CLDNNInferRequest(InferenceEnv env, bool useProfiling,
InputsDataMap networkInputs, OutputsDataMap networkOutputs)
: InferRequestInternal(networkInputs, networkOutputs),
m_useProfiling(useProfiling) {
if (m_env.m_max_batch > 1) {
// Start at the maximum batch; SetBatch also builds the slice metadata.
SetBatch(m_env.m_max_batch);
AllocateOutputsDyn();
// Fill implementations map
if (m_useProfiling) {
// Extracts the value of "implementation :" from a clDNN primitive-info
// string; returns "undef" when the section is missing or malformed.
auto extractImplementationFromInfo = [](const std::string& info) -> std::string {
std::string def_implementation = "undef";
std::string impl_section = "implementation :";
std::string::size_type pos = info.find(impl_section);
if (pos == std::string::npos) {
return def_implementation;
std::string::size_type end_pos = info.find(',', pos);
if (end_pos == std::string::npos) {
return def_implementation;
std::string::size_type length = end_pos - pos - impl_section.size();
// Trim surrounding spaces from the extracted token.
auto trim = [](const std::string& str) {
size_t first = str.find_first_not_of(' ');
if (std::string::npos == first) {
size_t last = str.find_last_not_of(' ');
return str.substr(first, (last - first + 1));
std::string tmp = trim(info.substr(pos + impl_section.size(), length));
return tmp.length() > 1 ? tmp : def_implementation;
// Parse primitive info and extract implementation name.
for (auto& id : m_env.profilingIDs) {
std::string prim_info = "";
// Best-effort: a primitive without info simply maps to "undef".
prim_info = m_env.network->get_primitive_info(id);
} catch (std::exception& e) { }
implementationsMap.insert({id, extractImplementationFromInfo(prim_info)});
// Runs the compiled clDNN network (static-batch path), copies outputs into
// user blobs when the user replaced the zero-copy buffers (SetBlob), and —
// when profiling is enabled — folds per-primitive timing events into
// m_env.perfMap.
void CLDNNInferRequest::execAndParse() {
auto networkOutputs = m_env.network->execute();
// Collect outputs as requested by the model
for (auto& no : _networkOutputs) {
std::string outputID = outputsMap[no.first];
auto outputMemory = networkOutputs.at(outputID).get_memory();
Blob::Ptr bptr = _outputs[no.first];
auto out_ptr = outputMemory.pointer<uint8_t>();
auto blob_ptr = bptr->buffer().as<uint8_t*>();
// If Async API is used, copy of output blobs is not needed, unless SetBlob function was called.
// But in the case when old API is used we have to copy data to memory provided by user.
if (blob_ptr != &out_ptr[0]) {
copyOutputData(outputMemory, bptr);
// finally collect profiling info
if (m_useProfiling) {
std::map<cldnn::primitive_id, cldnn::event> executedPrimitives = m_env.network->get_executed_primitives();
auto allPrimitives = m_env.network->get_all_primitives();
// Get profiling info for all layers
for (auto &profiledID : m_env.profilingIDs) {
auto& perfCount = m_env.perfMap[profiledID].second;
// Change status if layer wasn't executed by cldnn engine
if (perfCount.num == 0 &&
executedPrimitives.find(profiledID) == executedPrimitives.end()) {
if (allPrimitives.find(profiledID) != allPrimitives.end() &&
allPrimitives.at(profiledID) == "_optimized_") {
// Layer was marked as optimized by cldnn
perfCount.status = InferenceEngineProfileInfo::OPTIMIZED_OUT;
// Layer wasn't run for some reason
perfCount.status = InferenceEngineProfileInfo::NOT_RUN;
// Erase so the event (and its GPU resources) isn't re-read later.
auto event = executedPrimitives.at(profiledID);
executedPrimitives.erase(profiledID);
cldnn::instrumentation::profiling_info cldnnInfo{profiledID, event.get_profiling_info()};
// Accumulate microsecond durations per interval kind.
for (auto &interval : cldnnInfo.intervals) {
using duration_t = std::chrono::duration<long long, std::chrono::microseconds::period>;
auto count = std::chrono::duration_cast<duration_t>(interval.value->value()).count();
if (interval.name == "submission") {
perfCount.cpu_uSec += count;
} else if (interval.name == "executing") {
perfCount.realTime_uSec += count;
} else if (interval.name == "duration") { // "duration" is used for CPU layers
perfCount.cpu_uSec += count;
// First "duration" sample marks the layer as CPU-executed.
if (perfCount.num == 0)
perfCount.isCPU = true;
// Runs inference in dynamic-batch mode: the current batch is decomposed
// bitwise and each participating power-of-two sub-network is executed, then
// each sub-network's output slice is copied into the user blob using the
// buf_info precomputed by SetBatch.
void CLDNNInferRequest::execAndParseDyn() {
std::vector<std::map<cldnn::primitive_id, cldnn::network_output>> networkOutputs(m_env.m_bv_sz);
// set up exection and put all graphs into driver queue
for (unsigned nb = 0; nb < m_env.m_bv_sz; nb++) {
unsigned int mask = 1 << nb;
// Sub-network nb participates iff bit nb of the current batch is set.
if (m_curBatch & mask) {
networkOutputs[nb] = m_env.batchNetworks[nb]->execute();
// now try to get execution results
for (unsigned nb = 0; nb < m_env.m_bv_sz; nb++) {
unsigned int mask = 1 << nb;
if (m_curBatch & mask) {
for (auto& no : _networkOutputs) {
std::string outputID = no.first;
// Follow the primitive-id remapping chain to its terminal id.
while ((m_env.primitiveIDs.find(outputID) != m_env.primitiveIDs.end()) &&
(m_env.primitiveIDs.at(outputID) != outputID)) {
outputID = m_env.primitiveIDs.at(outputID);
auto outputMemory = networkOutputs[nb].at(outputID).get_memory();
Blob::Ptr bptr = _outputs[no.first];
// Copy this sub-batch slice into its region of the user blob.
copyOutputData(outputMemory, bptr, &batchOutputs[no.first][nb]);
// Main synchronous inference entry point: preprocesses inputs, stages them
// into the network(s), then executes. Dynamic-batch requests route through
// the *Dyn variants.
// NOTE(review): the tail of this function (the execAndParseDyn()/
// execAndParse() calls after the final branch) appears elided in this view.
void CLDNNInferRequest::InferImpl() {
IE_PROFILING_AUTO_SCOPE(CLDNN_INFER)
// execute input pre-processing.
execDataPreprocessing(_inputs, true); // "true" stands for serial preprocessing in case of OpenMP
for (auto &item : _inputs) {
if (m_env.m_max_batch > 1) {
PrepareInputDyn(item.first, *item.second);
PrepareInput(item.first, *item.second);
// The actual inference
if (m_env.m_max_batch > 1) {
// Exposes accumulated per-layer profiling data to the user. Throws if the
// request was created without profiling enabled. Layers without an
// associated IR name are skipped (continue appears on an elided line).
// NOTE(review): the declaration of the execution-index counter 'i' appears
// on an elided line — confirm against the full source.
void CLDNNInferRequest::GetPerformanceCounts(
std::map<std::string, InferenceEngineProfileInfo> &perfMap) const {
if (!m_useProfiling) {
THROW_IE_EXCEPTION << "Performance counters were not enabled";
for (auto& profiledID : m_env.profilingIDs) {
const auto& layerName = m_env.perfMap.at(profiledID).first;
if (layerName.length() == 0) // no layer directly associated
const auto& perfCounter = m_env.perfMap.at(profiledID).second;
auto& extPerfEntry = perfMap[layerName];
// copy layer implementation
if (perfCounter.isCPU) {
static const std::string cpuExecType("CPU");
memset(extPerfEntry.exec_type, 0, sizeof(extPerfEntry.exec_type));
cpuExecType.copy(extPerfEntry.exec_type, cpuExecType.length()); // Override execType as CPU
// GPU layers report the cached clDNN kernel implementation name.
std::string impl = implementationsMap.at(profiledID);
impl.copy(extPerfEntry.exec_type, impl.length());
extPerfEntry.execution_index = i++;
extPerfEntry.status = perfCounter.status;
// Averages over the number of recorded runs.
extPerfEntry.cpu_uSec = perfCounter.cpu_avg();
extPerfEntry.realTime_uSec = perfCounter.realTime_avg();
perfCounter.layerType.copy(extPerfEntry.layer_type, perfCounter.layerType.length());
// Stages one input blob into the static-batch network. Three paths:
// I16 -> converted to fp32 into the pre-allocated "<name>_fp32" staging
// memory; blob still backed by clDNN-allocated memory -> zero-copy
// set_input_data; user-provided memory (SetBlob) -> attach-and-copy via
// copyInputData.
void CLDNNInferRequest::PrepareInput(const cldnn::primitive_id &inputName, const Blob &inputBlob) {
if (m_env.inputLayouts.find(inputName) == m_env.inputLayouts.end()) {
THROW_IE_EXCEPTION << "Input name mismatch.";
auto inputLayout = m_env.inputLayouts.at(inputName);
// True when the blob still points at the clDNN-allocated buffer with the
// same byte size (i.e. the user did not replace it via SetBlob).
auto is_same_buffer = [](const Blob& blob, const cldnn::memory& memory) -> bool {
const std::string str_not_allocated("Input data was not allocated.");
cldnn::pointer<const uint8_t> ptr = memory.pointer<const uint8_t>();
const uint8_t* blob_ptr = blob.cbuffer().as<const uint8_t*>();
const uint8_t* mem_ptr = ptr.data();
if (blob_ptr == nullptr || mem_ptr == nullptr) {
THROW_IE_EXCEPTION << str_not_allocated;
return (blob_ptr == mem_ptr) && (blob.byteSize() == memory.size());
cldnn::primitive_id internalName = "Input:" + inputName;
const cldnn::memory& memory = inputsMemory.at(inputName);
if (inputBlob.precision() == Precision::I16) {
// clDNN doesn't support I16 input precision, so we always have to convert input data to fp32 precision
const cldnn::memory& fp32_mem = inputsMemory.at(inputName+fp32_suffix);
cldnn::pointer<float> ptr = fp32_mem.pointer<float>();
InferenceEngine::copyToFloat<int16_t>(ptr.data(), &inputBlob);
m_env.network->set_input_data(internalName, fp32_mem);
} else if (is_same_buffer(inputBlob, memory)) {
// If input memory was allocated by cldnn engine and wasn't overwritten by user set_input_data method won't copy input data.
switch (inputBlob.precision()) {
case Precision::FP32:
case Precision::FP16:
case Precision::U8: {
m_env.network->set_input_data(internalName, memory);
THROW_IE_EXCEPTION << "Unsupported input precision " << inputBlob.precision();
// Otherwise, we have to attach to user memory and then copy the data.
copyInputData(m_env.network, inputName, inputLayout, inputBlob);
// Stages one input blob into every participating dynamic-batch sub-network:
// the current batch is decomposed bitwise, and each set bit nb feeds
// sub-network nb (batch size 2^nb) its slice of the blob via the buf_info
// computed by SetBatch.
void CLDNNInferRequest::PrepareInputDyn(const cldnn::primitive_id &inputName, const Blob &inputBlob) {
// now try to get execution results
for (unsigned nb = 0; nb < m_env.m_bv_sz; nb++) {
unsigned int mask = 1 << nb;
if (m_curBatch & mask) {
auto inputLayout = m_env.inputLayouts.at(inputName);
// Sub-network nb processes exactly 'mask' (= 2^nb) samples.
inputLayout.size.batch[0] = mask;
copyInputData(m_env.batchNetworks[nb], inputName, inputLayout, inputBlob, &batchInputs[inputName][nb]);
630 }; // namespace CLDNNPlugin