 * Copyright (c) 2019 Samsung Electronics Co., Ltd All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
#include "mv_private.h"
#include "Inference.h"
#include "InferenceIni.h"
#include "ObjectDecoder.h"
#include "PoseDecoder.h"

#define MV_INFERENCE_OUTPUT_NUMBERS_MAX 10
#define MV_INFERENCE_OUTPUT_NUMBERS_MIN 1
#define MV_INFERENCE_CONFIDENCE_THRESHOLD_MAX 1.0
#define MV_INFERENCE_CONFIDENCE_THRESHOLD_MIN 0.0

using namespace mediavision::common::util;
using namespace mediavision::machine_learning;
InferenceConfig::InferenceConfig()
		, mDataType(MV_INFERENCE_DATA_FLOAT32)
		, mTargetTypes(MV_INFERENCE_TARGET_DEVICE_CPU)
		, mConfidenceThresHold()
		, mMaxOutputNumbers(1)
	mTensorInfo.width = -1;
	mTensorInfo.height = -1;
Inference::Inference()
	CheckSupportedInferenceBackend();

	for (auto &backend : mSupportedInferenceBackend) {
		LOGI("%s: %s", backend.second.first.c_str(), backend.second.second ? "TRUE" : "FALSE");
Inference::~Inference()
	CleanupTensorBuffers();

	if (!mInputLayerProperty.layers.empty()) {
		mInputLayerProperty.layers.clear();
		std::map<std::string, inference_engine_tensor_info>().swap(mInputLayerProperty.layers);

	if (!mOutputLayerProperty.layers.empty()) {
		mOutputLayerProperty.layers.clear();
		std::map<std::string, inference_engine_tensor_info>().swap(mOutputLayerProperty.layers);

	mModelFormats.clear();

	// Release backend engine.
	mBackend->UnbindBackend();

	LOGI("Released backend engine.");
void Inference::CheckSupportedInferenceBackend()
	std::vector<int> supportedBackend = ini.GetSupportedInferenceEngines();

	for (auto &backend : supportedBackend) {
		LOGI("engine: %d", backend);
		mSupportedInferenceBackend[backend].second = true;
int Inference::ConvertEngineErrorToVisionError(int error)
	int ret = MEDIA_VISION_ERROR_NONE;

	case INFERENCE_ENGINE_ERROR_NONE:
		ret = MEDIA_VISION_ERROR_NONE;
	case INFERENCE_ENGINE_ERROR_NOT_SUPPORTED:
		ret = MEDIA_VISION_ERROR_NOT_SUPPORTED;
	case INFERENCE_ENGINE_ERROR_MSG_TOO_LONG:
		ret = MEDIA_VISION_ERROR_MSG_TOO_LONG;
	case INFERENCE_ENGINE_ERROR_NO_DATA:
		ret = MEDIA_VISION_ERROR_NO_DATA;
	case INFERENCE_ENGINE_ERROR_KEY_NOT_AVAILABLE:
		ret = MEDIA_VISION_ERROR_KEY_NOT_AVAILABLE;
	case INFERENCE_ENGINE_ERROR_OUT_OF_MEMORY:
		ret = MEDIA_VISION_ERROR_OUT_OF_MEMORY;
	case INFERENCE_ENGINE_ERROR_INVALID_PARAMETER:
		ret = MEDIA_VISION_ERROR_INVALID_PARAMETER;
	case INFERENCE_ENGINE_ERROR_INVALID_OPERATION:
		ret = MEDIA_VISION_ERROR_INVALID_OPERATION;
	case INFERENCE_ENGINE_ERROR_PERMISSION_DENIED:
		ret = MEDIA_VISION_ERROR_PERMISSION_DENIED;
	case INFERENCE_ENGINE_ERROR_NOT_SUPPORTED_FORMAT:
		ret = MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
	case INFERENCE_ENGINE_ERROR_INTERNAL:
		ret = MEDIA_VISION_ERROR_INTERNAL;
	case INFERENCE_ENGINE_ERROR_INVALID_DATA:
		ret = MEDIA_VISION_ERROR_INVALID_DATA;
	case INFERENCE_ENGINE_ERROR_INVALID_PATH:
		ret = MEDIA_VISION_ERROR_INVALID_PATH;

		LOGE("Unknown inference engine error type");
int Inference::ConvertTargetTypes(int given_types)
	int target_types = INFERENCE_TARGET_NONE;

	if (given_types & MV_INFERENCE_TARGET_DEVICE_CPU)
		target_types |= INFERENCE_TARGET_CPU;
	if (given_types & MV_INFERENCE_TARGET_DEVICE_GPU)
		target_types |= INFERENCE_TARGET_GPU;
	if (given_types & MV_INFERENCE_TARGET_DEVICE_CUSTOM)
		target_types |= INFERENCE_TARGET_CUSTOM;
int Inference::ConvertToCv(int given_type)
	const int ch = mConfig.mTensorInfo.ch;

	switch (given_type) {
	case INFERENCE_TENSOR_DATA_TYPE_UINT8:
		LOGI("Type is %d ch with UINT8", ch);
		type = ch == 1 ? CV_8UC1 : CV_8UC3;
	case INFERENCE_TENSOR_DATA_TYPE_FLOAT32:
		LOGI("Type is %d ch with FLOAT32", ch);
		type = ch == 1 ? CV_32FC1 : CV_32FC3;
		LOGI("Unknown data type, so FLOAT32 will be used by default");
		type = ch == 1 ? CV_32FC1 : CV_32FC3;
inference_tensor_data_type_e Inference::ConvertToIE(int given_type)
	inference_tensor_data_type_e type = INFERENCE_TENSOR_DATA_TYPE_FLOAT32;

	switch (given_type) {
	case MV_INFERENCE_DATA_FLOAT32:
		type = INFERENCE_TENSOR_DATA_TYPE_FLOAT32;
	case MV_INFERENCE_DATA_UINT8:
		type = INFERENCE_TENSOR_DATA_TYPE_UINT8;
		LOGI("Unknown data type, so FLOAT32 will be used by default");
int Inference::SetUserFile(std::string filename)
	std::ifstream fp(filename.c_str());
		return MEDIA_VISION_ERROR_INVALID_PATH;
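	// Read the label file line by line; one label per line, skipping empty lines.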
	std::string userListName;

		std::getline(fp, userListName);
		if (userListName.length())
			mUserListName.push_back(userListName);

	return MEDIA_VISION_ERROR_NONE;
void Inference::ConfigureModelFiles(const std::string modelConfigFilePath, const std::string modelWeightFilePath,
									const std::string modelUserFilePath)
	mConfig.mConfigFilePath = modelConfigFilePath;
	mConfig.mWeightFilePath = modelWeightFilePath;
	mConfig.mUserFilePath = modelUserFilePath;
int Inference::ConfigureInputInfo(int width, int height, int dim, int ch, double stdValue, double meanValue,
								  int dataType, const std::vector<std::string> names)
	// FIXME: mConfig should be removed
	mConfig.mTensorInfo = { width, height, dim, ch };
	mConfig.mStdValue = stdValue;
	mConfig.mMeanValue = meanValue;
	mConfig.mDataType = static_cast<mv_inference_data_type_e>(dataType);
	mConfig.mInputLayerNames = names;

	int ret = setInputInfo();
int Inference::configureInputMetaInfo()
	LOGI("use input meta");

	auto &layerInfo = mMetadata.GetInputMeta().GetLayer().begin()->second;

	if (layerInfo.shapeType == INFERENCE_TENSOR_SHAPE_NCHW) { // NCHW
		mConfig.mTensorInfo.ch = layerInfo.dims[1];
		mConfig.mTensorInfo.dim = layerInfo.dims[0];
		mConfig.mTensorInfo.width = layerInfo.dims[3];
		mConfig.mTensorInfo.height = layerInfo.dims[2];
	} else if (layerInfo.shapeType == INFERENCE_TENSOR_SHAPE_NHWC) { // NHWC
		mConfig.mTensorInfo.ch = layerInfo.dims[3];
		mConfig.mTensorInfo.dim = layerInfo.dims[0];
		mConfig.mTensorInfo.width = layerInfo.dims[2];
		mConfig.mTensorInfo.height = layerInfo.dims[1];
		LOGE("Invalid shape type[%d]", layerInfo.shapeType);

	if (!mMetadata.GetInputMeta().GetOption().empty()) {
		auto &option = mMetadata.GetInputMeta().GetOption().begin()->second;
		if (option.normalization.use) {
			mConfig.mMeanValue = option.normalization.mean[0];
			mConfig.mStdValue = option.normalization.std[0];

	mConfig.mDataType = layerInfo.dataType;
	mConfig.mInputLayerNames.clear();

	for (auto &layer : mMetadata.GetInputMeta().GetLayer())
		mConfig.mInputLayerNames.push_back(layer.first);

	int ret = setInputInfo();
int Inference::configureInputMetaInfo(MetaMap &inputMetaInfo)
	LOGI("use input meta");

	mConfig.mInputLayerNames.clear();

	for (auto &meta : inputMetaInfo) {
		std::shared_ptr<MetaInfo> metaInfo = meta.second;

		mConfig.mTensorInfo.ch = metaInfo->getChannel();
		mConfig.mTensorInfo.dim = metaInfo->dims[0];
		mConfig.mTensorInfo.width = metaInfo->getWidth();
		mConfig.mTensorInfo.height = metaInfo->getHeight();

				std::static_pointer_cast<DecodingNormal>(metaInfo->decodingTypeMap[DecodingType::NORMAL]);
		if (normalization && normalization->use) {
			mConfig.mMeanValue = normalization->mean[0];
			mConfig.mStdValue = normalization->std[0];

		mConfig.mDataType = metaInfo->dataType;
		mConfig.mInputLayerNames.push_back(meta.first);

	} catch (const std::exception &e) {
		LOGE("Fail to configure input meta info.");
		return MEDIA_VISION_ERROR_INVALID_OPERATION;

	int ret = setInputInfo();
int Inference::setInputInfo()
	mInputSize = cv::Size(mConfig.mTensorInfo.width, mConfig.mTensorInfo.height);

	inference_engine_layer_property property;
	// If an inference plugin doesn't support querying layer properties,
	// the tensor info given by the user is used.
	// If the plugin does support it, the given info is ignored.
	for (auto &name : mConfig.mInputLayerNames) {
		inference_engine_tensor_info tensor_info;
		tensor_info.data_type = ConvertToIE(mConfig.mDataType);

		// In the case of OpenCV, only NCHW is supported.
		tensor_info.shape_type = INFERENCE_TENSOR_SHAPE_NCHW;
		// TODO: modify to handle multiple tensor infos.
		tensor_info.shape.push_back(mConfig.mTensorInfo.dim);
		tensor_info.shape.push_back(mConfig.mTensorInfo.ch);
		tensor_info.shape.push_back(mConfig.mTensorInfo.height);
		tensor_info.shape.push_back(mConfig.mTensorInfo.width);
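		// The tensor size is the element count, i.e. the product of all shape dimensions.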
		tensor_info.size = 1;
		for (auto &dim : tensor_info.shape) {
			tensor_info.size *= dim;

		property.layers.insert(std::make_pair(name, tensor_info));

	LOGI("InputSize is %d x %d\n", mInputSize.width, mInputSize.height);
	LOGI("mean %.4f, deviation %.4f", mConfig.mMeanValue, mConfig.mStdValue);
	LOGI("outputNumber %d", mConfig.mMaxOutputNumbers);

	int ret = mBackend->SetInputLayerProperty(property);
	if (ret != INFERENCE_ENGINE_ERROR_NONE)
		LOGE("Fail to set input layer property");
int Inference::ConfigureOutputInfo(const std::vector<std::string> names,
								   std::vector<inference_engine_tensor_info> &tensors_info)
	inference_engine_layer_property property;

	mConfig.mOutputLayerNames = names;

	if (tensors_info.empty()) {
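		// No tensor info was given, so register a minimal one-element FLOAT32
		// placeholder per output layer. The backend is expected to replace it
		// with the real shape once the model is loaded.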
		inference_engine_tensor_info tensor_info = { std::vector<size_t> { 1 }, INFERENCE_TENSOR_SHAPE_NCHW,
													 INFERENCE_TENSOR_DATA_TYPE_FLOAT32, 1 };

		for (auto &name : mConfig.mOutputLayerNames) {
			LOGI("Configure %s layer as output", name.c_str());
			property.layers.insert(std::make_pair(name, tensor_info));

		if (mConfig.mOutputLayerNames.size() != tensors_info.size()) {
			LOGE("Output layer count is different from tensor info count.");
			return MEDIA_VISION_ERROR_INVALID_PARAMETER;

		for (size_t idx = 0; idx < mConfig.mOutputLayerNames.size(); ++idx) {
			LOGI("Configure %s layer as output", mConfig.mOutputLayerNames[idx].c_str());
			property.layers.insert(std::make_pair(mConfig.mOutputLayerNames[idx], tensors_info[idx]));

	int ret = setOutputInfo(property);
int Inference::configureOutputMetaInfo()
	OutputMetadata &outputMeta = mMetadata.GetOutputMeta();

	mConfig.mOutputLayerNames.clear();

	if (!outputMeta._tensor_info.empty()) {
		for (auto &info : outputMeta._tensor_info)
			mConfig.mOutputLayerNames.push_back(info.first);

	inference_engine_tensor_info tensor_info = { std::vector<size_t> { 1 }, INFERENCE_TENSOR_SHAPE_NCHW,
												 INFERENCE_TENSOR_DATA_TYPE_FLOAT32, 1 };
	inference_engine_layer_property property;

	for (auto &name : mConfig.mOutputLayerNames) {
		LOGI("Configure %s layer as output", name.c_str());
		property.layers.insert(std::make_pair(name, tensor_info));

	int ret = setOutputInfo(property);
int Inference::configureOutputMetaInfo(MetaMap &outputMetaInfo)
	mConfig.mOutputLayerNames.clear();

	for (auto &meta : outputMetaInfo) {
		std::shared_ptr<MetaInfo> &metaInfo = meta.second;

		mConfig.mDataType = metaInfo->dataType;
		mConfig.mOutputLayerNames.push_back(meta.first);

	} catch (const std::exception &e) {
		LOGE("Fail to configure output meta info.");
		return MEDIA_VISION_ERROR_INVALID_OPERATION;

	inference_engine_tensor_info tensor_info = { std::vector<size_t> { 1 }, INFERENCE_TENSOR_SHAPE_NCHW,
												 INFERENCE_TENSOR_DATA_TYPE_FLOAT32, 1 };
	inference_engine_layer_property property;

	for (auto &name : mConfig.mOutputLayerNames) {
		LOGI("Configure %s layer as output", name.c_str());
		property.layers.insert(std::make_pair(name, tensor_info));

	int ret = setOutputInfo(property);
int Inference::setOutputInfo(inference_engine_layer_property &property)
	int ret = mBackend->SetOutputLayerProperty(property);
	if (ret != INFERENCE_ENGINE_ERROR_NONE)
		LOGE("Fail to set output layer property");
int Inference::CheckBackendType(const mv_inference_backend_type_e backendType)
	// Check if a given backend type is valid or not.
	if (backendType <= MV_INFERENCE_BACKEND_NONE || backendType >= MV_INFERENCE_BACKEND_MAX) {
		LOGE("Invalid backend type.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	std::pair<std::string, bool> backend = mSupportedInferenceBackend[backendType];
	if (backend.second == false) {
		LOGE("%s type is not supported", (backend.first).c_str());
		return MEDIA_VISION_ERROR_NOT_SUPPORTED;

	LOGI("backend engine : %d", backendType);

	return MEDIA_VISION_ERROR_NONE;
int Inference::ConfigureTargetTypes(int targetType, bool isNewVersion)
	if (MV_INFERENCE_TARGET_DEVICE_NONE >= targetType || MV_INFERENCE_TARGET_DEVICE_MAX <= targetType) {
		LOGE("Invalid target device.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	if (MV_INFERENCE_TARGET_NONE >= targetType || MV_INFERENCE_TARGET_MAX <= targetType) {
		LOGE("Invalid target device.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	LOGI("Before converting target types : %d", targetType);

	// Convert old type to new one.
	switch (targetType) {
	case MV_INFERENCE_TARGET_CPU:
		targetType = MV_INFERENCE_TARGET_DEVICE_CPU;
	case MV_INFERENCE_TARGET_GPU:
		targetType = MV_INFERENCE_TARGET_DEVICE_GPU;
	case MV_INFERENCE_TARGET_CUSTOM:
		targetType = MV_INFERENCE_TARGET_DEVICE_CUSTOM;

	LOGI("After converting target types : %d", targetType);

	mConfig.mTargetTypes = targetType;

	return MEDIA_VISION_ERROR_NONE;
int Inference::ConfigureTargetDevices(const int targetDevices)
	// Check if given target types are valid or not.
	if (MV_INFERENCE_TARGET_DEVICE_NONE >= targetDevices || MV_INFERENCE_TARGET_DEVICE_MAX <= targetDevices) {
		LOGE("Invalid target device.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	LOGI("target devices : %d", targetDevices);

	if (!(mBackendCapacity.supported_accel_devices & targetDevices)) {
		LOGE("Backend doesn't support a given device acceleration.");
		return MEDIA_VISION_ERROR_NOT_SUPPORTED;

	mConfig.mTargetTypes = targetDevices;

	return MEDIA_VISION_ERROR_NONE;
bool Inference::IsTargetDeviceSupported(const int targetDevices)
	if (!(mBackendCapacity.supported_accel_devices & targetDevices)) {
		LOGE("Backend doesn't support the given device acceleration (%x).", targetDevices);
void Inference::ConfigureOutput(const int maxOutputNumbers)
	mConfig.mMaxOutputNumbers =
			std::max(std::min(maxOutputNumbers, MV_INFERENCE_OUTPUT_NUMBERS_MAX), MV_INFERENCE_OUTPUT_NUMBERS_MIN);

void Inference::ConfigureThreshold(const double threshold)
	mConfig.mConfidenceThresHold =
			std::max(std::min(threshold, MV_INFERENCE_CONFIDENCE_THRESHOLD_MAX), MV_INFERENCE_CONFIDENCE_THRESHOLD_MIN);
int Inference::ParseMetadata(const std::string filePath)
	int ret = mMetadata.Init(filePath);
	if (ret != MEDIA_VISION_ERROR_NONE) {
		LOGE("Fail to init metadata[%d]", ret);

	ret = mMetadata.Parse();
	if (ret != MEDIA_VISION_ERROR_NONE) {
		LOGE("Fail to parse metadata[%d]", ret);

	return MEDIA_VISION_ERROR_NONE;
void Inference::CleanupTensorBuffers(void)
	if (!mInputTensorBuffers.empty()) {
		mInputTensorBuffers.release();

	if (!mOutputTensorBuffers.empty()) {
		mOutputTensorBuffers.release();
int Inference::PrepareTenosrBuffers(void)
	// Release any input and output tensor buffers allocated earlier.
	// They will be allocated again for the new model file to be loaded.
	CleanupTensorBuffers();

	// If a model file is loaded again, the model type could be different, so
	// clean up the input and output layer properties so that they can be
	// updated again after the model file is reloaded.
	if (!mInputLayerProperty.layers.empty()) {
		mInputLayerProperty.layers.clear();
		std::map<std::string, inference_engine_tensor_info>().swap(mInputLayerProperty.layers);

	if (!mOutputLayerProperty.layers.empty()) {
		mOutputLayerProperty.layers.clear();
		std::map<std::string, inference_engine_tensor_info>().swap(mOutputLayerProperty.layers);
	// Get input tensor buffers from the backend engine if it allocated them.
	auto &inputTensorBuffers = mInputTensorBuffers.getIETensorBuffer();
	int ret = mBackend->GetInputTensorBuffers(inputTensorBuffers);
	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
		LOGE("Fail to get input tensor buffers from backend engine.");
		return ConvertEngineErrorToVisionError(ret);

	ret = mBackend->GetInputLayerProperty(mInputLayerProperty);
	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
		LOGE("Fail to get input layer property from backend engine.");
		return ConvertEngineErrorToVisionError(ret);

	// If the backend engine isn't able to allocate input tensor buffers
	// internally, then allocate them here.
	if (mInputTensorBuffers.empty()) {
		for (auto &layer : mInputLayerProperty.layers) {
			inference_engine_tensor_buffer tensor_buffer;

			ret = mInputTensorBuffers.allocate(tensor_buffer, layer.second);
			if (ret != INFERENCE_ENGINE_ERROR_NONE) {
				LOGE("Fail to allocate tensor buffer.");
				mInputTensorBuffers.release();

			mInputTensorBuffers.addTensorBuffer(layer.first, tensor_buffer);

	LOGI("Input tensor buffer count is %zu", mInputTensorBuffers.size());
	// Get output tensor buffers from the backend engine if it allocated them.
	auto &outputTensorBuffers = mOutputTensorBuffers.getIETensorBuffer();
	ret = mBackend->GetOutputTensorBuffers(outputTensorBuffers);
	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
		LOGE("Fail to get output tensor buffers from backend engine.");
		return ConvertEngineErrorToVisionError(ret);

	ret = mBackend->GetOutputLayerProperty(mOutputLayerProperty);
	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
		LOGE("Fail to get output layer property from backend engine.");
		return ConvertEngineErrorToVisionError(ret);

	// If the backend engine isn't able to allocate output tensor buffers
	// internally, then allocate them here.
	if (mOutputTensorBuffers.empty()) {
		for (auto &layer : mOutputLayerProperty.layers) {
			inference_engine_tensor_buffer tensor_buffer;

			ret = mOutputTensorBuffers.allocate(tensor_buffer, layer.second);
			if (ret != INFERENCE_ENGINE_ERROR_NONE) {
				LOGE("Fail to allocate tensor buffer.");
				mOutputTensorBuffers.release();

			mOutputTensorBuffers.addTensorBuffer(layer.first, tensor_buffer);

	LOGI("Output tensor buffer count is %zu", mOutputTensorBuffers.size());

	return MEDIA_VISION_ERROR_NONE;
int Inference::ConvertOutputDataTypeToFloat()
	IETensorBuffer &ieTensorBuffers = mOutputTensorBuffers.getIETensorBuffer();

	for (auto &ieTensorBuffer : ieTensorBuffers) {
		auto &tensorBuffer = ieTensorBuffer.second;

		// Normalize the output tensor data by converting it to float type in
		// the case of a quantized model.
		if (tensorBuffer.data_type == INFERENCE_TENSOR_DATA_TYPE_UINT8) {
			int ret = mOutputTensorBuffers.convertToFloat<unsigned char>(&tensorBuffer);
			if (ret != MEDIA_VISION_ERROR_NONE) {
				LOGE("Fail to convert tensor data to float type.");

		if (tensorBuffer.data_type == INFERENCE_TENSOR_DATA_TYPE_UINT16) {
			int ret = mOutputTensorBuffers.convertToFloat<unsigned short>(&tensorBuffer);
			if (ret != MEDIA_VISION_ERROR_NONE) {
				LOGE("Fail to convert tensor data to float type.");

	return MEDIA_VISION_ERROR_NONE;
int Inference::Bind(int backend_type, int device_type)
	int ret = CheckBackendType(static_cast<mv_inference_backend_type_e>(backend_type));
	if (ret != MEDIA_VISION_ERROR_NONE)

	std::string backendName = mSupportedInferenceBackend[backend_type].first;
	LOGI("backend string name: %s", backendName.c_str());

	inference_engine_config config = {
		.backend_name = backendName,
		.backend_type = backend_type,
		// The target device is CPU by default. If the user defined a desired
		// device type in the JSON file, it will be set by the Load callback.
		.target_devices = device_type,

	// Create a backend class object.
	mBackend = new InferenceEngineCommon();

#if ENABLE_INFERENCE_PROFILER
	mBackend->EnableProfiler(true);
	mBackend->DumpProfileToFile("profile_data_" + backendName + ".txt");
	} catch (const std::bad_alloc &ex) {
		LOGE("Fail to create backend : %s", ex.what());
		return MEDIA_VISION_ERROR_OUT_OF_MEMORY;

	ret = MEDIA_VISION_ERROR_NONE;
	// Load the configuration file if the given backend type is MLAPI.
	if (config.backend_type == MV_INFERENCE_BACKEND_MLAPI) {
		ret = mBackend->LoadConfigFile();
		if (ret != INFERENCE_ENGINE_ERROR_NONE) {
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

	// Bind a backend library.
	ret = mBackend->BindBackend(&config);
	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
		LOGE("Fail to bind backend library.(%d)", ret);
		return MEDIA_VISION_ERROR_INVALID_OPERATION;

	// Get capacity information from the backend.
	ret = mBackend->GetBackendCapacity(&mBackendCapacity);
	if (ret != MEDIA_VISION_ERROR_NONE) {
		mBackend->UnbindBackend();
		LOGE("Fail to get backend capacity.");

	if (!IsTargetDeviceSupported(mConfig.mTargetTypes)) {
		mBackend->UnbindBackend();
		LOGE("Tried to configure invalid target types.");
		return MEDIA_VISION_ERROR_NOT_SUPPORTED;

	return MEDIA_VISION_ERROR_NONE;
int Inference::Load(void)
	std::string label_file = mConfig.mUserFilePath;
	size_t userFileLength = label_file.length();
	if (userFileLength > 0 && access(label_file.c_str(), F_OK)) {
		LOGE("Label file [%s] is not accessible.", label_file.c_str());
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	int ret = (userFileLength > 0) ? SetUserFile(label_file) : MEDIA_VISION_ERROR_NONE;
	if (ret != MEDIA_VISION_ERROR_NONE) {
		LOGE("Fail to load label file.");

	// Check if the model file is valid or not.
	std::string ext_str = mConfig.mWeightFilePath.substr(mConfig.mWeightFilePath.find_last_of(".") + 1);
	std::map<std::string, int>::iterator key = mModelFormats.find(ext_str);
	if (key == mModelFormats.end()) {
		LOGE("Invalid model file format.(ext = %s)", ext_str.c_str());
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	LOGI("%s model file has been detected.", ext_str.c_str());

	std::vector<std::string> models;

	inference_model_format_e model_format = static_cast<inference_model_format_e>(key->second);

	// Push model file information to the models vector according to the detected model format.
	switch (model_format) {
	case INFERENCE_MODEL_CAFFE:
	case INFERENCE_MODEL_TF:
	case INFERENCE_MODEL_DARKNET:
	case INFERENCE_MODEL_DLDT:
	case INFERENCE_MODEL_ONNX:
	case INFERENCE_MODEL_VIVANTE:
		models.push_back(mConfig.mWeightFilePath);
		models.push_back(mConfig.mConfigFilePath);
	case INFERENCE_MODEL_TFLITE:
	case INFERENCE_MODEL_TORCH:
	case INFERENCE_MODEL_NNTRAINER:
	case INFERENCE_MODEL_SNPE:
		models.push_back(mConfig.mWeightFilePath);

	// Request model loading to the backend engine.
	ret = mBackend->Load(models, model_format);
	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
		LOGE("Fail to load model");
		std::vector<std::string>().swap(models);
		return ConvertEngineErrorToVisionError(ret);

	std::vector<std::string>().swap(models);

	// Prepare input and output tensor buffers.
	ret = PrepareTenosrBuffers();
	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
		LOGE("Fail to prepare buffer");

	return ConvertEngineErrorToVisionError(ret);
int Inference::Preprocess(std::vector<mv_source_h> &mv_sources, std::vector<cv::Mat> &cv_sources)
	unsigned int src_idx = 0;

	for (auto &buffer : mInputTensorBuffers.getIETensorBuffer()) {
		inference_engine_tensor_buffer &tensor_buffer = buffer.second;
		int data_type = ConvertToCv(tensor_buffer.data_type);
		mv_colorspace_e colorspace = MEDIA_VISION_COLORSPACE_INVALID;

		int ret = mv_source_get_colorspace(mv_sources[src_idx], &colorspace);
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to get color space.");

		if (mMetadata.GetInputMeta().IsParsed()) {
			layerInfo = mMetadata.GetInputMeta().GetLayer().at(buffer.first);

			if (!mMetadata.GetInputMeta().GetOption().empty())
				opt = mMetadata.GetInputMeta().GetOption().at(buffer.first);

			// NOTE: in the legacy path there is no way to set model-specific
			// dequantization parameters - zero point and scale.
			// TODO: find a proper way to support them.
			opt.normalization.use = true;
			opt.normalization.mean.assign(3, mConfig.mMeanValue);
			opt.normalization.std.assign(3, mConfig.mStdValue);

			layerInfo.name = buffer.first;
			layerInfo.dims.push_back(mConfig.mTensorInfo.dim);
			layerInfo.dims.push_back(mConfig.mTensorInfo.height);
			layerInfo.dims.push_back(mConfig.mTensorInfo.width);
			layerInfo.dims.push_back(mConfig.mTensorInfo.ch);

			// NOTE: in the legacy path there is no way to use a model-specific
			// color space, only a fixed one.
			// TODO: find a proper way to support it.
			layerInfo.colorSpace = MEDIA_VISION_COLORSPACE_RGB888;
			layerInfo.dataType = mConfig.mDataType;
			// TODO: find a proper way to set the shape type. The legacy path
			// cannot change it, but a different shape type may be needed
			// depending on the given inference engine.
			layerInfo.shapeType = INFERENCE_TENSOR_SHAPE_NHWC;

		// TODO: try-catch{} error handling
		ret = mPreProc.Run(cv_sources[src_idx++], colorspace, data_type, layerInfo, opt, tensor_buffer.buffer);
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to run pre-process.");

	return MEDIA_VISION_ERROR_NONE;
int Inference::Run(std::vector<mv_source_h> &mvSources, std::vector<mv_rectangle_s> &rects)
	int ret = INFERENCE_ENGINE_ERROR_NONE;

		LOGE("Not ready to run inference");
		return MEDIA_VISION_ERROR_INVALID_OPERATION;

	if (mvSources.empty()) {
		LOGE("mvSources is empty.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	// Only one input source per inference request is supported for now.
	if (mvSources.size() > 1) {
		LOGE("Only one mv source is allowed for the inference.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	if (!rects.empty() && rects.size() != mvSources.size()) {
		LOGE("mvSources.size() should be same as rects.size() if rects.empty() is false.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	if (mConfig.mTensorInfo.ch != 1 && mConfig.mTensorInfo.ch != 3) {
		LOGE("Channel not supported.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;
	std::vector<cv::Mat> cvSources;

	ret = ConvertToCvSource(mvSources, cvSources, rects);
	if (ret != MEDIA_VISION_ERROR_NONE) {
		LOGE("Fail to convert mv source to cv source.");

	// mSourceSize is the original input image's size.
	// TODO: consider multiple cv sources.
	mSourceSize = cvSources[0].size();

	ret = Preprocess(mvSources, cvSources);
	if (ret != MEDIA_VISION_ERROR_NONE) {
		LOGE("Fail to preprocess given input sources.");

	ret = mBackend->Run(mInputTensorBuffers.getIETensorBuffer(), mOutputTensorBuffers.getIETensorBuffer());
	if (ret != INFERENCE_ENGINE_ERROR_NONE)

	return ConvertOutputDataTypeToFloat();
int Inference::Run(std::vector<void *> &buffer_objs)
	int ret = INFERENCE_ENGINE_ERROR_NONE;

		LOGE("Not ready to run inference");
		return MEDIA_VISION_ERROR_INVALID_OPERATION;

	if (buffer_objs.empty()) {
		LOGE("buffer_objs is empty.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	// Only one input source per inference request is supported for now.
	if (buffer_objs.size() > 1) {
		LOGE("Only one source is allowed for the inference.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	if (mInputTensorBuffers.getIETensorBuffer().size() != buffer_objs.size()) {
		LOGE("Raw source count is invalid.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	unsigned int buffer_idx = 0;

	for (auto &buffer : mInputTensorBuffers.getIETensorBuffer()) {
		inference_engine_tensor_buffer &tensor_buffer = buffer.second;
		inference_engine_tensor_buffer *buffer_obj =
				static_cast<inference_engine_tensor_buffer *>(buffer_objs[buffer_idx]);

		if (tensor_buffer.size != buffer_obj->size) {
			LOGE("Raw buffer size is invalid.");
			return MEDIA_VISION_ERROR_INVALID_PARAMETER;

		LOGI("Number of tensor bytes: %zu", buffer_obj->size);

		memcpy(tensor_buffer.buffer, buffer_obj->buffer, tensor_buffer.size);

	ret = mBackend->Run(mInputTensorBuffers.getIETensorBuffer(), mOutputTensorBuffers.getIETensorBuffer());
	if (ret != INFERENCE_ENGINE_ERROR_NONE)

	return ConvertOutputDataTypeToFloat();
int Inference::Run()
	int ret = INFERENCE_ENGINE_ERROR_NONE;

		LOGE("Not ready to run inference");
		return MEDIA_VISION_ERROR_INVALID_OPERATION;

	ret = mBackend->Run(mInputTensorBuffers.getIETensorBuffer(), mOutputTensorBuffers.getIETensorBuffer());
	if (ret != INFERENCE_ENGINE_ERROR_NONE)

	return ConvertOutputDataTypeToFloat();
std::pair<std::string, bool> Inference::GetSupportedInferenceBackend(int backend)
	return mSupportedInferenceBackend[backend];
int Inference::GetClassficationResults(ImageClassificationResults *results)
	// Will contain the top N results in ascending order.
	std::vector<std::pair<float, int> > topScore;
	auto threshold = mConfig.mConfidenceThresHold;
	constexpr unsigned int default_top_number = 5;
	tensor_t outputTensorInfo;

	// Get the inference result and store it in outputTensorInfo.
	int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
	if (ret != MEDIA_VISION_ERROR_NONE) {
		LOGE("Fail to get output result.");

	PostProcess postProc;
	unsigned int classes = outputTensorInfo.dimInfo[0][1];
	unsigned int top_number = default_top_number;

	if (mMetadata.GetOutputMeta().IsParsed()) {
		OutputMetadata outputMetadata = mMetadata.GetOutputMeta();
		std::vector<int> indexes = outputMetadata.GetScoreDimInfo().GetValidIndexAll();

		if (indexes.size() != 1) {
			LOGE("Invalid dim size. It should be 1");
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		if (!mOutputTensorBuffers.exist(outputMetadata.GetScoreName())) {
			LOGE("output buffer is NULL");
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		top_number = outputMetadata.GetScoreTopNumber();
		threshold = outputMetadata.GetScoreThreshold();

		classes = mOutputLayerProperty.layers[outputMetadata.GetScoreName()].shape[indexes[0]];

	postProc.ScoreClear(top_number);
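	// ScoreClear()/ScorePush()/ScorePop() presumably maintain a bounded top-N
	// container of (score, index) pairs: reset it to hold top_number entries,
	// push candidates that pass the threshold, then pop the final ranking.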
	auto *prediction = reinterpret_cast<float *>(outputTensorInfo.data[0]);

	LOGI("class count: %u", classes);

	for (unsigned int idx = 0; idx < classes; ++idx) {
		float value = prediction[idx];

		if (mMetadata.GetOutputMeta().IsParsed()) {
			OutputMetadata outputMetadata = mMetadata.GetOutputMeta();
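			// Dequantize the raw score, presumably with the standard affine
			// mapping real = scale * (quantized - zero_point).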
			if (outputMetadata.GetScoreDeQuant()) {
				value = PostProcess::dequant(value, outputMetadata.GetScoreDeQuantScale(),
											 outputMetadata.GetScoreDeQuantZeroPoint());

			if (outputMetadata.GetScoreType() == INFERENCE_SCORE_TYPE_SIGMOID)
				value = PostProcess::sigmoid(value);

		if (value < threshold)

		postProc.ScorePush(value, idx);

	postProc.ScorePop(topScore);
	results->number_of_classes = 0;

	for (auto &score : topScore) {
		LOGI("score: %.3f, threshold: %.3f", score.first, threshold);
		LOGI("idx:%d", score.second);
		LOGI("classProb: %.3f", score.first);

		results->indices.push_back(score.second);
		results->confidences.push_back(score.first);
		results->names.push_back(mUserListName[score.second]);
		results->number_of_classes++;

	LOGI("Inference: GetClassificationResults: %d\n", results->number_of_classes);
	return MEDIA_VISION_ERROR_NONE;
int Inference::GetObjectDetectionResults(ObjectDetectionResults *results)
	if (mMetadata.GetOutputMeta().IsParsed()) {
		OutputMetadata &outputMeta = mMetadata.GetOutputMeta();

		if (!mOutputTensorBuffers.exist(outputMeta.GetBoxName()) ||
			!mOutputTensorBuffers.exist(outputMeta.GetScoreName())) {
			LOGE("output buffers named %s or %s are NULL", outputMeta.GetBoxName().c_str(),
				 outputMeta.GetScoreName().c_str());
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
		if (boxIndexes.size() != 1) {
			LOGE("Invalid dim size. It should be 1");
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		int boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
		int numberOfObjects = 0;

		if (outputMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
			std::vector<int> scoreIndexes = outputMeta.GetScoreDimInfo().GetValidIndexAll();
			if (scoreIndexes.size() != 1) {
				LOGE("Invalid dim size. It should be 1");
				return MEDIA_VISION_ERROR_INVALID_OPERATION;

			numberOfObjects = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]];

		ObjectDecoder objDecoder(mOutputTensorBuffers, outputMeta, boxOffset,
								 static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getWidth()),
								 static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getHeight()),

		objDecoder.decode();
		results->number_of_objects = 0;

		auto &rLoc = results->locations;

		for (auto &box : objDecoder.getObjectAll()) {
			results->indices.push_back(box.index);
			results->names.push_back(mUserListName[box.index]);
			results->confidences.push_back(box.score);
			auto &bLoc = box.location;

			auto srcW = static_cast<double>(mSourceSize.width);
			auto srcH = static_cast<double>(mSourceSize.height);
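			// The decoded box is center-based and normalized; halfW/halfH below
			// hold the top-left corner obtained by shifting the center by half
			// the box size (despite what their names suggest).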
			auto halfW = (bLoc.x - bLoc.width * 0.5f);
			auto halfH = (bLoc.y - bLoc.height * 0.5f);
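			// Letterbox preprocessing scales the source by
			// scale = min(1, dstW / srcW, dstH / srcH) and pads the remainder
			// equally on both sides, so a normalized model-space coordinate u
			// maps back to the source image as (u * dst - pad) / scale.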
			if (mMetadata.GetInputMeta().option.begin()->second.resizer == Resizer::LETTERBOX) {
				double dstW = static_cast<double>(mMetadata.GetInputMeta().layer.begin()->second.getWidth());
				double dstH = static_cast<double>(mMetadata.GetInputMeta().layer.begin()->second.getHeight());
				double scale = std::min(1.0, std::min(dstW / srcW, dstH / srcH));
				double padSize[] = { (dstW - (scale * srcW)) / 2.0, (dstH - (scale * srcH)) / 2.0 };

						cv::Rect(static_cast<int>(std::min(srcW, std::max((halfW * dstW - padSize[0]) / scale, 0.0))),
								 static_cast<int>(std::min(srcH, std::max((halfH * dstH - padSize[1]) / scale, 0.0))),
								 static_cast<int>((bLoc.width * dstW) / scale + padSize[0]),
								 static_cast<int>((bLoc.height * dstH) / scale + padSize[1]));

				rect.width = (rect.x + rect.width) > srcW ? srcW - rect.x : rect.width;
				rect.height = (rect.y + rect.height) > srcH ? srcH - rect.y : rect.height;

				rLoc.push_back(rect);
				rLoc.push_back(cv::Rect(halfW * srcW, halfH * srcH, bLoc.width * srcW, bLoc.height * srcH));

			results->number_of_objects++;

		LOGI("Inference: GetObjectDetectionResults: %d\n", results->number_of_objects);
		tensor_t outputTensorInfo;

		// Get the inference result and store it in outputTensorInfo.
		int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to get output result.");

		// In the case of object detection, some models apply post-processing
		// internally and others don't, so the two cases are handled separately.
		float *boxes = nullptr;
		float *classes = nullptr;
		float *scores = nullptr;
		int number_of_detections = 0;

		if (outputTensorInfo.dimInfo.size() == 1) {
			// There is no way to know how many objects were detected unless the
			// number is provided explicitly, so each backend has to provide it
			// manually. For example, OpenCV's MobilenetV1-SSD doesn't provide
			// it: the output shape is 1x1xNx7, where the 1st of the 7 values
			// holds the image id. That id is useless without batch-mode support,
			// so it is reused here to carry the number of detections.
			number_of_detections = static_cast<int>(*reinterpret_cast<float *>(outputTensorInfo.data[0]));
			cv::Mat cvOutputData(number_of_detections, outputTensorInfo.dimInfo[0][3], CV_32F,
								 outputTensorInfo.data[0]);

			cv::Mat cvLeft = cvOutputData.col(3).clone();
			cv::Mat cvTop = cvOutputData.col(4).clone();
			cv::Mat cvRight = cvOutputData.col(5).clone();
			cv::Mat cvBottom = cvOutputData.col(6).clone();
			cv::Mat cvScores, cvClasses, cvBoxes;
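			// Reorder the columns to [top, left, bottom, right] so the decode
			// loop below can index boxes[idx * 4 + n] uniformly for both branches.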
			cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
			cv::hconcat(cvBoxElems, 4, cvBoxes);

			cvClasses = cvOutputData.col(1).clone();

			cvScores = cvOutputData.col(2).clone();

			boxes = cvBoxes.ptr<float>(0);
			classes = cvClasses.ptr<float>(0);
			scores = cvScores.ptr<float>(0);
			boxes = reinterpret_cast<float *>(outputTensorInfo.data[0]);
			classes = reinterpret_cast<float *>(outputTensorInfo.data[1]);
			scores = reinterpret_cast<float *>(outputTensorInfo.data[2]);
			number_of_detections = static_cast<int>(*reinterpret_cast<float *>(outputTensorInfo.data[3]));

		LOGI("number_of_detections = %d", number_of_detections);

		results->number_of_objects = 0;

		for (int idx = 0; idx < number_of_detections; ++idx) {
			if (scores[idx] < mConfig.mConfidenceThresHold)

			int left = static_cast<int>(boxes[idx * 4 + 1] * mSourceSize.width);
			int top = static_cast<int>(boxes[idx * 4 + 0] * mSourceSize.height);
			int right = static_cast<int>(boxes[idx * 4 + 3] * mSourceSize.width);
			int bottom = static_cast<int>(boxes[idx * 4 + 2] * mSourceSize.height);

			loc.width = right - left + 1;
			loc.height = bottom - top + 1;

			results->indices.push_back(static_cast<int>(classes[idx]));
			results->confidences.push_back(scores[idx]);
			results->names.push_back(mUserListName[static_cast<int>(classes[idx])]);
			results->locations.push_back(loc);
			results->number_of_objects++;

			LOGI("objectClass: %d", static_cast<int>(classes[idx]));
			LOGI("confidence:%f", scores[idx]);
			LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);

		LOGI("Inference: GetObjectDetectionResults: %d\n", results->number_of_objects);

	return MEDIA_VISION_ERROR_NONE;
int Inference::GetFaceDetectionResults(FaceDetectionResults *results)
	if (mMetadata.GetOutputMeta().IsParsed()) {
		OutputMetadata &outputMeta = mMetadata.GetOutputMeta();

		if (!mOutputTensorBuffers.exist(outputMeta.GetBoxName()) ||
			!mOutputTensorBuffers.exist(outputMeta.GetScoreName())) {
			LOGE("output buffers named %s or %s are NULL", outputMeta.GetBoxName().c_str(),
				 outputMeta.GetScoreName().c_str());
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
		if (boxIndexes.size() != 1) {
			LOGE("Invalid dim size. It should be 1");
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		int boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
		int numberOfFaces = 0;

		if (outputMeta.GetBoxDecodingType() != INFERENCE_BOX_DECODING_TYPE_BYPASS) {
			std::vector<int> scoreIndexes = outputMeta.GetScoreDimInfo().GetValidIndexAll();
			if (scoreIndexes.size() != 1) {
				LOGE("Invalid dim size. It should be 1");
				return MEDIA_VISION_ERROR_INVALID_OPERATION;

			numberOfFaces = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]];
		ObjectDecoder objDecoder(mOutputTensorBuffers, outputMeta, boxOffset,
								 static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getWidth()),
								 static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getHeight()),

		objDecoder.decode();
		results->number_of_faces = 0;

		for (auto &face : objDecoder.getObjectAll()) {
			results->confidences.push_back(face.score);
			results->locations.push_back(
					cv::Rect(static_cast<int>((face.location.x - face.location.width * 0.5f) *
											  static_cast<float>(mSourceSize.width)),
							 static_cast<int>((face.location.y - face.location.height * 0.5f) *
											  static_cast<float>(mSourceSize.height)),
							 static_cast<int>(face.location.width * static_cast<float>(mSourceSize.width)),
							 static_cast<int>(face.location.height * static_cast<float>(mSourceSize.height))));
			results->number_of_faces++;

		LOGI("Inference: GetFaceDetectionResults: %d\n", results->number_of_faces);
		tensor_t outputTensorInfo;

		// Get the inference result and store it in outputTensorInfo.
		int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to get output result.");

		// In the case of face detection, some models apply post-processing
		// internally and others don't, so the two cases are handled separately.
		float *boxes = nullptr;
		float *classes = nullptr;
		float *scores = nullptr;
		int number_of_detections = 0;
		cv::Mat cvScores, cvClasses, cvBoxes;

		if (outputTensorInfo.dimInfo.size() == 1) {
			// There is no way to know how many objects were detected unless the
			// number is provided explicitly, so each backend has to provide it
			// manually. For example, OpenCV's MobilenetV1-SSD doesn't provide
			// it: the output shape is 1x1xNx7, where the 1st of the 7 values
			// holds the image id. That id is useless without batch-mode support,
			// so it is reused here to carry the number of detections.
			number_of_detections = static_cast<int>(*reinterpret_cast<float *>(outputTensorInfo.data[0]));
			cv::Mat cvOutputData(number_of_detections, outputTensorInfo.dimInfo[0][3], CV_32F,
								 outputTensorInfo.data[0]);

			cv::Mat cvLeft = cvOutputData.col(3).clone();
			cv::Mat cvTop = cvOutputData.col(4).clone();
			cv::Mat cvRight = cvOutputData.col(5).clone();
			cv::Mat cvBottom = cvOutputData.col(6).clone();
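			// As in the object detection path, reorder the columns to
			// [top, left, bottom, right] for the decode loop below.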
			cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
			cv::hconcat(cvBoxElems, 4, cvBoxes);

			cvClasses = cvOutputData.col(1).clone();

			cvScores = cvOutputData.col(2).clone();

			boxes = cvBoxes.ptr<float>(0);
			classes = cvClasses.ptr<float>(0);
			scores = cvScores.ptr<float>(0);
			boxes = reinterpret_cast<float *>(outputTensorInfo.data[0]);
			classes = reinterpret_cast<float *>(outputTensorInfo.data[1]);
			scores = reinterpret_cast<float *>(outputTensorInfo.data[2]);
			number_of_detections = static_cast<int>(*reinterpret_cast<float *>(outputTensorInfo.data[3]));

		results->number_of_faces = 0;

		for (int idx = 0; idx < number_of_detections; ++idx) {
			if (scores[idx] < mConfig.mConfidenceThresHold)

			int left = static_cast<int>(boxes[idx * 4 + 1] * mSourceSize.width);
			int top = static_cast<int>(boxes[idx * 4 + 0] * mSourceSize.height);
			int right = static_cast<int>(boxes[idx * 4 + 3] * mSourceSize.width);
			int bottom = static_cast<int>(boxes[idx * 4 + 2] * mSourceSize.height);

			loc.width = right - left + 1;
			loc.height = bottom - top + 1;
			results->confidences.push_back(scores[idx]);
			results->locations.push_back(loc);
			results->number_of_faces++;

			LOGI("confidence:%f", scores[idx]);
			LOGI("class: %f", classes[idx]);
			LOGI("left:%f, top:%f, right:%f, bottom:%f", boxes[idx * 4 + 1], boxes[idx * 4 + 0], boxes[idx * 4 + 3],
				 boxes[idx * 4 + 2]);
			LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);

		LOGI("Inference: GetFaceDetectionResults: %d\n", results->number_of_faces);

	return MEDIA_VISION_ERROR_NONE;
int Inference::GetFacialLandMarkDetectionResults(FacialLandMarkDetectionResults *results)
	if (mMetadata.GetOutputMeta().IsParsed()) {
		OutputMetadata &outputMeta = mMetadata.GetOutputMeta();
		if (!mOutputTensorBuffers.exist(outputMeta.GetLandmarkName()) ||
			!mOutputTensorBuffers.exist(outputMeta.GetScoreName())) {
			LOGE("output buffers named %s or %s are NULL", outputMeta.GetLandmarkName().c_str(),
				 outputMeta.GetScoreName().c_str());
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		int heatMapWidth = 0;
		int heatMapHeight = 0;
		int heatMapChannel = 0;
		std::vector<int> channelIndexes = outputMeta.GetLandmarkDimInfo().GetValidIndexAll();
		int number_of_landmarks = heatMapChannel;

		if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
			LOGI("landmark dim size: %zu and idx[0] is %d", channelIndexes.size(), channelIndexes[0]);
			number_of_landmarks = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]] /
								  outputMeta.GetLandmarkOffset();
		} else if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) {
			number_of_landmarks = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]];

			heatMapWidth = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
								   .shape[outputMeta.GetLandmarkHeatMapInfo().wIdx];
			heatMapHeight = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
									.shape[outputMeta.GetLandmarkHeatMapInfo().hIdx];
			heatMapChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
									 .shape[outputMeta.GetLandmarkHeatMapInfo().cIdx];

		LOGI("heatMap: w[%d], h[%d], c[%d]", heatMapWidth, heatMapHeight, heatMapChannel);

		PoseDecoder poseDecoder(mOutputTensorBuffers, outputMeta, heatMapWidth, heatMapHeight, heatMapChannel,
								number_of_landmarks);
		// Initialize the decoder queue with landmarks to be decoded.
		int ret = poseDecoder.init();
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to init poseDecoder");

		if (outputMeta.GetLandmarkCoordinate() == INFERENCE_LANDMARK_COORDINATE_TYPE_PIXEL) {
			inputW = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getWidth());
			inputH = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getHeight());

		float thresRadius = outputMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ?
								  outputMeta.GetLandmarkHeatMapInfo().nmsRadius;

		poseDecoder.decode(inputW, inputH, thresRadius);

		for (int landmarkIndex = 0; landmarkIndex < number_of_landmarks; landmarkIndex++) {
			results->locations.push_back(
					cv::Point(poseDecoder.getPointX(0, landmarkIndex) * static_cast<float>(mSourceSize.width),
							  poseDecoder.getPointY(0, landmarkIndex) * static_cast<float>(mSourceSize.height)));

		results->number_of_landmarks = results->locations.size();
		tensor_t outputTensorInfo;

		// Get the inference result and store it in outputTensorInfo.
		int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to get output result.");
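		// In the bypass path, each landmark is an (x, y) pair, so the landmark
		// count is half of the flattened output dimension.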
		int number_of_detections = outputTensorInfo.dimInfo[0][1] >> 1;

		results->number_of_landmarks = number_of_detections;
		results->locations.resize(number_of_detections);

		LOGI("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height);

		float *loc = reinterpret_cast<float *>(outputTensorInfo.data[0]);

		for (auto &point : results->locations) {
			point.x = static_cast<int>(*loc++ * mSourceSize.width);
			point.y = static_cast<int>(*loc++ * mSourceSize.height);

			LOGI("x:%d, y:%d", point.x, point.y);

	LOGI("Inference: FacialLandmarkDetectionResults: %d\n", results->number_of_landmarks);
	return MEDIA_VISION_ERROR_NONE;
int Inference::GetPoseLandmarkDetectionResults(std::unique_ptr<mv_inference_pose_s> &detectionResults, int width,
	auto poseResult = std::make_unique<mv_inference_pose_s>();

	if (mMetadata.GetOutputMeta().IsParsed()) {
		OutputMetadata &outputMeta = mMetadata.GetOutputMeta();

		if (!mOutputTensorBuffers.exist(outputMeta.GetLandmarkName()) ||
			!mOutputTensorBuffers.exist(outputMeta.GetScoreName())) {
			LOGE("output buffers named %s or %s are NULL", outputMeta.GetLandmarkName().c_str(),
				 outputMeta.GetScoreName().c_str());
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		int heatMapWidth = 0;
		int heatMapHeight = 0;
		int heatMapChannel = 0;

		if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP ||
			outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) {
			heatMapWidth = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
								   .shape[outputMeta.GetLandmarkHeatMapInfo().wIdx];
			heatMapHeight = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
									.shape[outputMeta.GetLandmarkHeatMapInfo().hIdx];
			heatMapChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
									 .shape[outputMeta.GetLandmarkHeatMapInfo().cIdx];

		LOGI("heatMap: w[%d], h[%d], c[%d]", heatMapWidth, heatMapHeight, heatMapChannel);

		std::vector<int> channelIndexes = outputMeta.GetLandmarkDimInfo().GetValidIndexAll();

		// If INFERENCE_LANDMARK_DECODING_TYPE_BYPASS, the landmark channel is
		// derived from the shape of the landmark output tensor. Otherwise,
		// heatMapChannel is used by default.
		int landmarkChannel = heatMapChannel;

		if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS)
			landmarkChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]] /
							  outputMeta.GetLandmarkOffset();
		else if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL)
			landmarkChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]];

		poseResult->number_of_landmarks_per_pose = mUserListName.empty() ? landmarkChannel :
																		   static_cast<int>(mUserListName.size());

		LOGI("number of landmarks per pose: %d", poseResult->number_of_landmarks_per_pose);

		if (poseResult->number_of_landmarks_per_pose >= MAX_NUMBER_OF_LANDMARKS_PER_POSE) {
			LOGE("Exceeded maximum number of landmarks per pose(%d >= %d).", poseResult->number_of_landmarks_per_pose,
				 MAX_NUMBER_OF_LANDMARKS_PER_POSE);
			return MEDIA_VISION_ERROR_INVALID_PARAMETER;
		PoseDecoder poseDecoder(mOutputTensorBuffers, outputMeta, heatMapWidth, heatMapHeight, heatMapChannel,
								poseResult->number_of_landmarks_per_pose);

		// Initialize the decoder queue with landmarks to be decoded.
		int ret = poseDecoder.init();
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to init poseDecoder");

		float thresRadius = outputMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ?
								  outputMeta.GetLandmarkHeatMapInfo().nmsRadius;
		if (outputMeta.GetLandmarkCoordinate() == INFERENCE_LANDMARK_COORDINATE_TYPE_PIXEL) {
			inputW = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getWidth());
			inputH = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getHeight());

		poseDecoder.decode(inputW, inputH, thresRadius);
		poseResult->number_of_poses = poseDecoder.getNumberOfPose();

		for (int poseIndex = 0; poseIndex < poseResult->number_of_poses; ++poseIndex) {
			for (int landmarkIndex = 0; landmarkIndex < poseResult->number_of_landmarks_per_pose; ++landmarkIndex) {
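				// A non-empty user file remaps the landmark order; each entry is
				// expected to hold a 1-based joint index into the model output.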
				int part = landmarkIndex;
				if (!mUserListName.empty()) {
					part = std::stoi(mUserListName[landmarkIndex]) - 1;

				poseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
				poseResult->landmarks[poseIndex][landmarkIndex].point.x =
						poseDecoder.getPointX(poseIndex, part) * static_cast<float>(mSourceSize.width);
				poseResult->landmarks[poseIndex][landmarkIndex].point.y =
						poseDecoder.getPointY(poseIndex, part) * static_cast<float>(mSourceSize.height);
				poseResult->landmarks[poseIndex][landmarkIndex].label = landmarkIndex;
				poseResult->landmarks[poseIndex][landmarkIndex].score = poseDecoder.getScore(poseIndex, part);

		detectionResults = std::move(poseResult);
		tensor_t outputTensorInfo;

		// Get the inference result and store it in outputTensorInfo.
		int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to get output result.");

		cv::Mat reShapeTest(cv::Size(outputTensorInfo.dimInfo[0][2], outputTensorInfo.dimInfo[0][1]),
							CV_32FC(outputTensorInfo.dimInfo[0][3]), outputTensorInfo.data[0]);
		// Use std::vector instead of a variable-length array, which isn't standard C++.
		std::vector<cv::Mat> multiChannels(outputTensorInfo.dimInfo[0][3]);

		cv::split(reShapeTest, multiChannels);
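		// After reshaping to H x W with C channels, each channel holds one
		// landmark's heat map; the per-channel peak gives that landmark's position.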
		float ratioX = static_cast<float>(outputTensorInfo.dimInfo[0][2]);
		float ratioY = static_cast<float>(outputTensorInfo.dimInfo[0][1]);

		poseResult->number_of_poses = 1;
		poseResult->number_of_landmarks_per_pose = outputTensorInfo.dimInfo[0][3];

		if (poseResult->number_of_landmarks_per_pose >= MAX_NUMBER_OF_LANDMARKS_PER_POSE) {
			LOGE("Exceeded maximum number of landmarks per pose(%d >= %d).", poseResult->number_of_landmarks_per_pose,
				 MAX_NUMBER_OF_LANDMARKS_PER_POSE);
			return MEDIA_VISION_ERROR_INVALID_PARAMETER;
		for (int poseIndex = 0; poseIndex < poseResult->number_of_poses; ++poseIndex) {
			for (int landmarkIndex = 0; landmarkIndex < poseResult->number_of_landmarks_per_pose; landmarkIndex++) {
				int part = landmarkIndex;
				if (!mUserListName.empty()) {
					part = std::stoi(mUserListName[landmarkIndex]) - 1;

				cv::Mat heatMap = multiChannels[part];

				cv::Mat blurredHeatMap;

				cv::GaussianBlur(heatMap, blurredHeatMap, cv::Size(), 5.0, 5.0);
				// Search the blurred map so that the blur actually suppresses
				// noisy single-pixel maxima before peak extraction.
				cv::minMaxLoc(blurredHeatMap, NULL, &score, NULL, &loc);

				loc2f.x = (static_cast<float>(loc.x) / ratioX);
				loc2f.y = (static_cast<float>(loc.y) / ratioY);

				LOGI("landmarkIndex[%2d] - mapping to [%2d]: x[%.3f], y[%.3f], score[%.3f]", landmarkIndex, part,
					 loc2f.x, loc2f.y, score);

				poseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
				poseResult->landmarks[poseIndex][landmarkIndex].point.x =
						static_cast<int>(static_cast<float>(width) * loc2f.x);
				poseResult->landmarks[poseIndex][landmarkIndex].point.y =
						static_cast<int>(static_cast<float>(height) * loc2f.y);
				poseResult->landmarks[poseIndex][landmarkIndex].score = score;
				poseResult->landmarks[poseIndex][landmarkIndex].label = -1;

		detectionResults = std::move(poseResult);

	return MEDIA_VISION_ERROR_NONE;