platform/core/api/mediavision.git: mv_machine_learning/inference/src/Inference.cpp (commit a3917d5ba809f2c1108632d1057cadd0f31bd6f3)
1 /**
2  * Copyright (c) 2019 Samsung Electronics Co., Ltd All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "mv_private.h"
18 #include "Inference.h"
19 #include "InferenceIni.h"
20 #include "ObjectDecoder.h"
21 #include "PoseDecoder.h"
22 #include "util.h"
23 #include <map>
24 #include <list>
25
26 #include <unistd.h>
27 #include <fstream>
28 #include <string>
29 #include <queue>
30 #include <algorithm>
31
32 #define MV_INFERENCE_OUTPUT_NUMBERS_MAX 10
33 #define MV_INFERENCE_OUTPUT_NUMBERS_MIN 1
34 #define MV_INFERENCE_CONFIDENCE_THRESHOLD_MAX 1.0
35 #define MV_INFERENCE_CONFIDENCE_THRESHOLD_MIN 0.0
36
37 typedef enum {
38         InputAttrNoType = 0,
39         InputAttrFloat32 = 1,
40         InputAttrInt32 = 2,
41         InputAttrUInt8 = 3,
42         InputAttrInt64 = 4,
43         InputAttrString = 5,
44         InputAttrBool = 6,
45 } InputAttrType;
46
47 using namespace mediavision::common::util;
48 using namespace mediavision::machine_learning;
49
50 namespace mediavision
51 {
52 namespace inference
53 {
54 InferenceConfig::InferenceConfig()
55                 : mConfigFilePath()
56                 , mWeightFilePath()
57                 , mUserFilePath()
58                 , mDataType(MV_INFERENCE_DATA_FLOAT32)
59                 , mTargetTypes(MV_INFERENCE_TARGET_DEVICE_CPU)
60                 , mConfidenceThresHold()
61                 , mMeanValue()
62                 , mStdValue()
63                 , mMaxOutputNumbers(1)
64 {
65         mTensorInfo.width = -1;
66         mTensorInfo.height = -1;
67         mTensorInfo.dim = -1;
68         mTensorInfo.ch = -1;
69 }
70
71 Inference::Inference()
72 {
73         LOGI("ENTER");
74
75         CheckSupportedInferenceBackend();
76
77         for (auto &backend : mSupportedInferenceBackend) {
78                 LOGI("%s: %s", backend.second.first.c_str(), backend.second.second ? "TRUE" : "FALSE");
79         }
80         LOGI("LEAVE");
81 }
82
83 Inference::~Inference()
84 {
85         CleanupTensorBuffers();
86
87         if (!mInputLayerProperty.layers.empty()) {
88                 mInputLayerProperty.layers.clear();
89                 std::map<std::string, inference_engine_tensor_info>().swap(mInputLayerProperty.layers);
90         }
91         if (!mOutputLayerProperty.layers.empty()) {
92                 mOutputLayerProperty.layers.clear();
93                 std::map<std::string, inference_engine_tensor_info>().swap(mOutputLayerProperty.layers);
94         }
95
96         mModelFormats.clear();
97
98         // Release backend engine.
99         if (mBackend) {
100                 mBackend->UnbindBackend();
101                 delete mBackend;
102         }
103
104         LOGI("Released backend engine.");
105 }
106
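// Reads the inference ini file and marks every engine listed there as available in
// mSupportedInferenceBackend, so CheckBackendType() can later reject backends that
// are not installed on the device.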
107 void Inference::CheckSupportedInferenceBackend()
108 {
109         LOGI("ENTER");
110
111         InferenceInI ini;
112         ini.LoadInI();
113
114         std::vector<int> supportedBackend = ini.GetSupportedInferenceEngines();
115         for (auto &backend : supportedBackend) {
116                 LOGI("engine: %d", backend);
117
118                 mSupportedInferenceBackend[backend].second = true;
119         }
120
121         LOGI("LEAVE");
122 }
123
124 int Inference::ConvertEngineErrorToVisionError(int error)
125 {
126         int ret = MEDIA_VISION_ERROR_NONE;
127
128         switch (error) {
129         case INFERENCE_ENGINE_ERROR_NONE:
130                 ret = MEDIA_VISION_ERROR_NONE;
131                 break;
132         case INFERENCE_ENGINE_ERROR_NOT_SUPPORTED:
133                 ret = MEDIA_VISION_ERROR_NOT_SUPPORTED;
134                 break;
135         case INFERENCE_ENGINE_ERROR_MSG_TOO_LONG:
136                 ret = MEDIA_VISION_ERROR_MSG_TOO_LONG;
137                 break;
138         case INFERENCE_ENGINE_ERROR_NO_DATA:
139                 ret = MEDIA_VISION_ERROR_NO_DATA;
140                 break;
141         case INFERENCE_ENGINE_ERROR_KEY_NOT_AVAILABLE:
142                 ret = MEDIA_VISION_ERROR_KEY_NOT_AVAILABLE;
143                 break;
144         case INFERENCE_ENGINE_ERROR_OUT_OF_MEMORY:
145                 ret = MEDIA_VISION_ERROR_OUT_OF_MEMORY;
146                 break;
147         case INFERENCE_ENGINE_ERROR_INVALID_PARAMETER:
148                 ret = MEDIA_VISION_ERROR_INVALID_PARAMETER;
149                 break;
150         case INFERENCE_ENGINE_ERROR_INVALID_OPERATION:
151                 ret = MEDIA_VISION_ERROR_INVALID_OPERATION;
152                 break;
153         case INFERENCE_ENGINE_ERROR_PERMISSION_DENIED:
154                 ret = MEDIA_VISION_ERROR_PERMISSION_DENIED;
155                 break;
156         case INFERENCE_ENGINE_ERROR_NOT_SUPPORTED_FORMAT:
157                 ret = MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
158                 break;
159         case INFERENCE_ENGINE_ERROR_INTERNAL:
160                 ret = MEDIA_VISION_ERROR_INTERNAL;
161                 break;
162         case INFERENCE_ENGINE_ERROR_INVALID_DATA:
163                 ret = MEDIA_VISION_ERROR_INVALID_DATA;
164                 break;
165         case INFERENCE_ENGINE_ERROR_INVALID_PATH:
166                 ret = MEDIA_VISION_ERROR_INVALID_PATH;
167                 break;
168         default:
169                 LOGE("Unknown inference engine error type");
170         }
171
172         return ret;
173 }
174
175 int Inference::ConvertTargetTypes(int given_types)
176 {
177         int target_types = INFERENCE_TARGET_NONE;
178
179         if (given_types & MV_INFERENCE_TARGET_DEVICE_CPU)
180                 target_types |= INFERENCE_TARGET_CPU;
181         if (given_types & MV_INFERENCE_TARGET_DEVICE_GPU)
182                 target_types |= INFERENCE_TARGET_GPU;
183         if (given_types & MV_INFERENCE_TARGET_DEVICE_CUSTOM)
184                 target_types |= INFERENCE_TARGET_CUSTOM;
185
186         return target_types;
187 }
188
189 int Inference::ConvertToCv(int given_type)
190 {
191         int type = 0;
192         const int ch = mConfig.mTensorInfo.ch;
193
194         switch (given_type) {
195         case INFERENCE_TENSOR_DATA_TYPE_UINT8:
196                 LOGI("Type is %d ch with UINT8", ch);
197                 type = ch == 1 ? CV_8UC1 : CV_8UC3;
198                 break;
199         case INFERENCE_TENSOR_DATA_TYPE_FLOAT32:
200                 LOGI("Type is %d ch with FLOAT32", ch);
201                 type = ch == 1 ? CV_32FC1 : CV_32FC3;
202                 break;
203         default:
204                 LOGI("unknown data type so FLOAT32 data type will be used by default");
205                 type = ch == 1 ? CV_32FC1 : CV_32FC3;
206                 break;
207         }
208
209         return type;
210 }
211
212 inference_tensor_data_type_e Inference::ConvertToIE(int given_type)
213 {
214         inference_tensor_data_type_e type = INFERENCE_TENSOR_DATA_TYPE_FLOAT32;
215
216         switch (given_type) {
217         case MV_INFERENCE_DATA_FLOAT32:
218                 type = INFERENCE_TENSOR_DATA_TYPE_FLOAT32;
219                 break;
220         case MV_INFERENCE_DATA_UINT8:
221                 type = INFERENCE_TENSOR_DATA_TYPE_UINT8;
222                 break;
223         default:
224                 LOGI("unknown data type so FLOAT32 data type will be used by default");
225                 break;
226         }
227
228         return type;
229 }
230
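// Loads the user-provided label file: each non-empty line is appended to
// mUserListName and later used to map class indices to human-readable names.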
231 int Inference::SetUserFile(std::string filename)
232 {
233         std::ifstream fp(filename.c_str());
234         if (!fp.is_open()) {
235                 return MEDIA_VISION_ERROR_INVALID_PATH;
236         }
237
238         std::string userListName;
239         while (!fp.eof()) {
240                 std::getline(fp, userListName);
241                 if (userListName.length())
242                         mUserListName.push_back(userListName);
243         }
244
245         fp.close();
246
247         return MEDIA_VISION_ERROR_NONE;
248 }
249
250 void Inference::ConfigureModelFiles(const std::string modelConfigFilePath, const std::string modelWeightFilePath,
251                                                                         const std::string modelUserFilePath)
252 {
253         LOGI("ENTER");
254
255         mConfig.mConfigFilePath = modelConfigFilePath;
256         mConfig.mWeightFilePath = modelWeightFilePath;
257         mConfig.mUserFilePath = modelUserFilePath;
258
259         LOGI("LEAVE");
260 }
261
262 int Inference::ConfigureInputInfo(int width, int height, int dim, int ch, double stdValue, double meanValue,
263                                                                   int dataType, const std::vector<std::string> names)
264 {
265         LOGI("ENTER");
266
267         // FIXME: mConfig should be removed
268         mConfig.mTensorInfo = { width, height, dim, ch };
269         mConfig.mStdValue = stdValue;
270         mConfig.mMeanValue = meanValue;
271         mConfig.mDataType = static_cast<mv_inference_data_type_e>(dataType);
272         mConfig.mInputLayerNames = names;
273
274         int ret = setInputInfo();
275
276         LOGI("LEAVE");
277
278         return ret;
279 }
280
281 int Inference::configureInputMetaInfo()
282 {
283         LOGI("ENTER");
284
285         LOGI("use input meta");
286
287         auto &layerInfo = mMetadata.GetInputMeta().GetLayer().begin()->second;
288
289         if (layerInfo.shapeType == INFERENCE_TENSOR_SHAPE_NCHW) { // NCHW
290                 mConfig.mTensorInfo.ch = layerInfo.dims[1];
291                 mConfig.mTensorInfo.dim = layerInfo.dims[0];
292                 mConfig.mTensorInfo.width = layerInfo.dims[3];
293                 mConfig.mTensorInfo.height = layerInfo.dims[2];
294         } else if (layerInfo.shapeType == INFERENCE_TENSOR_SHAPE_NHWC) { // NHWC
295                 mConfig.mTensorInfo.ch = layerInfo.dims[3];
296                 mConfig.mTensorInfo.dim = layerInfo.dims[0];
297                 mConfig.mTensorInfo.width = layerInfo.dims[2];
298                 mConfig.mTensorInfo.height = layerInfo.dims[1];
299         } else {
300                 LOGE("Invalid shape type[%d]", layerInfo.shapeType);
301         }
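        // As an illustration (values are only an example), an NHWC layer with dims {1, 224, 224, 3}
        // maps to dim = 1, height = 224, width = 224 and ch = 3, while the same image as NCHW
        // would be {1, 3, 224, 224}.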
302
303         if (!mMetadata.GetInputMeta().GetOption().empty()) {
304                 auto &option = mMetadata.GetInputMeta().GetOption().begin()->second;
305                 if (option.normalization.use) {
306                         mConfig.mMeanValue = option.normalization.mean[0];
307                         mConfig.mStdValue = option.normalization.std[0];
308                 }
309         }
310
311         mConfig.mDataType = layerInfo.dataType;
312         mConfig.mInputLayerNames.clear();
313
314         for (auto &layer : mMetadata.GetInputMeta().GetLayer())
315                 mConfig.mInputLayerNames.push_back(layer.first);
316
317         int ret = setInputInfo();
318
319         LOGI("LEAVE");
320
321         return ret;
322 }
323
324 int Inference::configureInputMetaInfo(MetaMap &inputMetaInfo)
325 {
326         LOGI("ENTER");
327
328         LOGI("use input meta");
329
330         mConfig.mInputLayerNames.clear();
331
332         try {
333                 for (auto &meta : inputMetaInfo) {
334                         std::shared_ptr<MetaInfo> metaInfo = meta.second;
335
336                         mConfig.mTensorInfo.ch = metaInfo->getChannel();
337                         mConfig.mTensorInfo.dim = metaInfo->dims[0];
338                         mConfig.mTensorInfo.width = metaInfo->getWidth();
339                         mConfig.mTensorInfo.height = metaInfo->getHeight();
340
341                         auto normalization =
342                                         std::static_pointer_cast<DecodingNormal>(metaInfo->decodingTypeMap[DecodingType::NORMAL]);
343                         if (normalization && normalization->use) {
344                                 mConfig.mMeanValue = normalization->mean[0];
345                                 mConfig.mStdValue = normalization->std[0];
346                         }
347
348                         mConfig.mDataType = metaInfo->dataType;
349                         mConfig.mInputLayerNames.push_back(meta.first);
350                 }
351         } catch (const std::exception &e) {
352                 LOGE("Fail to configure input meta info.");
353                 return MEDIA_VISION_ERROR_INVALID_OPERATION;
354         }
355
356         int ret = setInputInfo();
357
358         LOGI("LEAVE");
359
360         return ret;
361 }
362
363 int Inference::setInputInfo()
364 {
365         LOGI("ENTER");
366
367         mInputSize = cv::Size(mConfig.mTensorInfo.width, mConfig.mTensorInfo.height);
368
369         inference_engine_layer_property property;
370         // If an inference plugin doesn't support querying layer properties,
371         // the tensor info given by the user is used instead.
372         // If the plugin does support it, the given info is ignored.
373
374         for (auto &name : mConfig.mInputLayerNames) {
375                 inference_engine_tensor_info tensor_info;
376                 tensor_info.data_type = ConvertToIE(mConfig.mDataType);
377
378                 // In case of OpenCV, only NCHW is supported.
379                 tensor_info.shape_type = INFERENCE_TENSOR_SHAPE_NCHW;
380                 // modify to handle multiple tensor infos
381                 tensor_info.shape.push_back(mConfig.mTensorInfo.dim);
382                 tensor_info.shape.push_back(mConfig.mTensorInfo.ch);
383                 tensor_info.shape.push_back(mConfig.mTensorInfo.height);
384                 tensor_info.shape.push_back(mConfig.mTensorInfo.width);
385
386                 tensor_info.size = 1;
387                 for (auto &dim : tensor_info.shape) {
388                         tensor_info.size *= dim;
389                 }
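                // For example (illustrative values only), a single 224x224 RGB input gives
                // shape {1, 3, 224, 224} and size 1 * 3 * 224 * 224 = 150528 elements.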
390
391                 property.layers.insert(std::make_pair(name, tensor_info));
392         }
393
394         LOGI("InputSize is %d x %d\n", mInputSize.width, mInputSize.height);
395         LOGI("mean %.4f, deviation %.4f", mConfig.mMeanValue, mConfig.mStdValue);
396         LOGI("outputNumber %d", mConfig.mMaxOutputNumbers);
397
398         int ret = mBackend->SetInputLayerProperty(property);
399         if (ret != INFERENCE_ENGINE_ERROR_NONE)
400                 LOGE("Fail to set input layer property");
401
402         LOGI("LEAVE");
403
404         return ret;
405 }
406
407 int Inference::ConfigureOutputInfo(const std::vector<std::string> names,
408                                                                    std::vector<inference_engine_tensor_info> &tensors_info)
409 {
410         LOGI("ENTER");
411
412         inference_engine_layer_property property;
413
414         mConfig.mOutputLayerNames = names;
415
416         if (tensors_info.empty()) {
417                 inference_engine_tensor_info tensor_info = { std::vector<size_t> { 1 }, INFERENCE_TENSOR_SHAPE_NCHW,
418                                                                                                          INFERENCE_TENSOR_DATA_TYPE_FLOAT32, 1 };
419
420                 for (auto &name : mConfig.mOutputLayerNames) {
421                         LOGI("Configure %s layer as output", name.c_str());
422                         property.layers.insert(std::make_pair(name, tensor_info));
423                 }
424         } else {
425                 if (mConfig.mOutputLayerNames.size() != tensors_info.size()) {
426                         LOGE("Output layer count is different from tensor info count.");
427                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
428                 }
429
430                 for (size_t idx = 0; idx < mConfig.mOutputLayerNames.size(); ++idx) {
431                         LOGI("Configure %s layer as output", mConfig.mOutputLayerNames[idx].c_str());
432                         property.layers.insert(std::make_pair(mConfig.mOutputLayerNames[idx], tensors_info[idx]));
433                 }
434         }
435
436         int ret = setOutputInfo(property);
437
438         LOGI("LEAVE");
439
440         return ret;
441 }
442
443 int Inference::configureOutputMetaInfo()
444 {
445         LOGI("ENTER");
446
447         OutputMetadata &outputMeta = mMetadata.GetOutputMeta();
448
449         mConfig.mOutputLayerNames.clear();
450
451         if (!outputMeta._tensor_info.empty()) {
452                 for (auto &info : outputMeta._tensor_info)
453                         mConfig.mOutputLayerNames.push_back(info.first);
454         }
455
456         inference_engine_tensor_info tensor_info = { std::vector<size_t> { 1 }, INFERENCE_TENSOR_SHAPE_NCHW,
457                                                                                                  INFERENCE_TENSOR_DATA_TYPE_FLOAT32, 1 };
458         inference_engine_layer_property property;
459
460         for (auto &name : mConfig.mOutputLayerNames) {
461                 LOGI("Configure %s layer as output", name.c_str());
462                 property.layers.insert(std::make_pair(name, tensor_info));
463         }
464
465         int ret = setOutputInfo(property);
466
467         LOGI("LEAVE");
468
469         return ret;
470 }
471
472 int Inference::configureOutputMetaInfo(MetaMap &outputMetaInfo)
473 {
474         LOGI("ENTER");
475
476         mConfig.mOutputLayerNames.clear();
477
478         try {
479                 for (auto &meta : outputMetaInfo) {
480                         std::shared_ptr<MetaInfo> &metaInfo = meta.second;
481
482                         mConfig.mDataType = metaInfo->dataType;
483                         mConfig.mOutputLayerNames.push_back(meta.first);
484                 }
485         } catch (const std::exception &e) {
486                 LOGE("Fail to configure output meta info.");
487                 return MEDIA_VISION_ERROR_INVALID_OPERATION;
488         }
489
490         inference_engine_tensor_info tensor_info = { std::vector<size_t> { 1 }, INFERENCE_TENSOR_SHAPE_NCHW,
491                                                                                                  INFERENCE_TENSOR_DATA_TYPE_FLOAT32, 1 };
492         inference_engine_layer_property property;
493
494         for (auto &name : mConfig.mOutputLayerNames) {
495                 LOGI("Configure %s layer as output", name.c_str());
496                 property.layers.insert(std::make_pair(name, tensor_info));
497         }
498
499         int ret = setOutputInfo(property);
500
501         LOGI("LEAVE");
502
503         return ret;
504 }
505
506 int Inference::setOutputInfo(inference_engine_layer_property &property)
507 {
508         LOGI("ENTER");
509
510         int ret = mBackend->SetOutputLayerProperty(property);
511         if (ret != INFERENCE_ENGINE_ERROR_NONE)
512                 LOGE("Fail to set output layer property");
513
514         LOGI("LEAVE");
515
516         return ret;
517 }
518
519 int Inference::CheckBackendType(const mv_inference_backend_type_e backendType)
520 {
521         // Check if a given backend type is valid or not.
522         if (backendType <= MV_INFERENCE_BACKEND_NONE || backendType >= MV_INFERENCE_BACKEND_MAX) {
523                 LOGE("Invalid backend type.");
524                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
525         }
526
527         std::pair<std::string, bool> backend = mSupportedInferenceBackend[backendType];
528         if (backend.second == false) {
529                 LOGE("%s type is not supported", (backend.first).c_str());
530                 return MEDIA_VISION_ERROR_NOT_SUPPORTED;
531         }
532
533         LOGI("backend engine : %d", backendType);
534
535         return MEDIA_VISION_ERROR_NONE;
536 }
537
538 int Inference::ConfigureTargetTypes(int targetType, bool isNewVersion)
539 {
540         if (isNewVersion) {
541                 if (MV_INFERENCE_TARGET_DEVICE_NONE >= targetType || MV_INFERENCE_TARGET_DEVICE_MAX <= targetType) {
542                         LOGE("Invalid target device.");
543                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
544                 }
545         } else {
546                 if (MV_INFERENCE_TARGET_NONE >= targetType || MV_INFERENCE_TARGET_MAX <= targetType) {
547                         LOGE("Invalid target device.");
548                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
549                 }
550
551                 LOGI("Before converting target types : %d", targetType);
552
553                 // Convert old type to new one.
554                 switch (targetType) {
555                 case MV_INFERENCE_TARGET_CPU:
556                         targetType = MV_INFERENCE_TARGET_DEVICE_CPU;
557                         break;
558                 case MV_INFERENCE_TARGET_GPU:
559
560                         targetType = MV_INFERENCE_TARGET_DEVICE_GPU;
561                         break;
562                 case MV_INFERENCE_TARGET_CUSTOM:
563                         targetType = MV_INFERENCE_TARGET_DEVICE_CUSTOM;
564                         break;
565                 }
566
567                 LOGI("After converting target types : %d", targetType);
568         }
569
570         mConfig.mTargetTypes = targetType;
571
572         return MEDIA_VISION_ERROR_NONE;
573 }
574
575 int Inference::ConfigureTargetDevices(const int targetDevices)
576 {
577         // Check if given target types are valid or not.
578         if (MV_INFERENCE_TARGET_DEVICE_NONE >= targetDevices || MV_INFERENCE_TARGET_DEVICE_MAX <= targetDevices) {
579                 LOGE("Invalid target device.");
580                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
581         }
582
583         LOGI("target devices : %d", targetDevices);
584
585         if (!(mBackendCapacity.supported_accel_devices & targetDevices)) {
586                 LOGE("Backend doesn't support a given device acceleration.");
587                 return MEDIA_VISION_ERROR_NOT_SUPPORTED;
588         }
589
590         mConfig.mTargetTypes = targetDevices;
591
592         return MEDIA_VISION_ERROR_NONE;
593 }
594
595 bool Inference::IsTargetDeviceSupported(const int targetDevices)
596 {
597         if (!(mBackendCapacity.supported_accel_devices & targetDevices)) {
598                 LOGE("Backend doesn't support a given %x device acceleration.", targetDevices);
599                 return false;
600         }
601
602         return true;
603 }
604
605 void Inference::ConfigureOutput(const int maxOutputNumbers)
606 {
607         mConfig.mMaxOutputNumbers =
608                         std::max(std::min(maxOutputNumbers, MV_INFERENCE_OUTPUT_NUMBERS_MAX), MV_INFERENCE_OUTPUT_NUMBERS_MIN);
609 }
610
611 void Inference::ConfigureThreshold(const double threshold)
612 {
613         mConfig.mConfidenceThresHold =
614                         std::max(std::min(threshold, MV_INFERENCE_CONFIDENCE_THRESHOLD_MAX), MV_INFERENCE_CONFIDENCE_THRESHOLD_MIN);
615 }
616
617 int Inference::ParseMetadata(const std::string filePath)
618 {
619         LOGI("ENTER");
620         int ret = mMetadata.Init(filePath);
621         if (ret != MEDIA_VISION_ERROR_NONE) {
622                 LOGE("Fail to init metadata[%d]", ret);
623                 return ret;
624         }
625
626         ret = mMetadata.Parse();
627         if (ret != MEDIA_VISION_ERROR_NONE) {
628                 LOGE("Fail to parse metadata[%d]", ret);
629                 return ret;
630         }
631
632         LOGI("LEAVE");
633
634         return MEDIA_VISION_ERROR_NONE;
635 }
636
637 void Inference::CleanupTensorBuffers(void)
638 {
639         LOGI("ENTER");
640
641         if (!mInputTensorBuffers.empty()) {
642                 mInputTensorBuffers.release();
643         }
644
645         if (!mOutputTensorBuffers.empty()) {
646                 mOutputTensorBuffers.release();
647         }
648
649         LOGI("LEAVE");
650 }
651
652 int Inference::PrepareTenosrBuffers(void)
653 {
654         // If input and output tensor buffers were allocated before, release them.
655         // They will be allocated again according to the new model file to be loaded.
656         CleanupTensorBuffers();
657
658         // If the model file is loaded again, the model type could be different, so
659         // clean up the input and output layer properties so that they can be updated
660         // again after reloading the model file.
661         if (!mInputLayerProperty.layers.empty()) {
662                 mInputLayerProperty.layers.clear();
663                 std::map<std::string, inference_engine_tensor_info>().swap(mInputLayerProperty.layers);
664         }
665         if (!mOutputLayerProperty.layers.empty()) {
666                 mOutputLayerProperty.layers.clear();
667                 std::map<std::string, inference_engine_tensor_info>().swap(mOutputLayerProperty.layers);
668         }
669
670         // Get input tensor buffers from the backend engine if it allocated them.
671         auto &inputTensorBuffers = mInputTensorBuffers.getIETensorBuffer();
672         int ret = mBackend->GetInputTensorBuffers(inputTensorBuffers);
673         if (ret != INFERENCE_ENGINE_ERROR_NONE) {
674                 LOGE("Fail to get input tensor buffers from backend engine.");
675                 return ConvertEngineErrorToVisionError(ret);
676         }
677
678         ret = mBackend->GetInputLayerProperty(mInputLayerProperty);
679         if (ret != INFERENCE_ENGINE_ERROR_NONE) {
680                 LOGE("Fail to get input layer property from backend engine.");
681                 return ConvertEngineErrorToVisionError(ret);
682         }
683
684         // If the backend engine isn't able to allocate input tensor buffers internally,
685         // then allocate the buffers here.
686         if (mInputTensorBuffers.empty()) {
687                 for (auto &layer : mInputLayerProperty.layers) {
688                         inference_engine_tensor_buffer tensor_buffer;
689
690                         ret = mInputTensorBuffers.allocate(tensor_buffer, layer.second);
691                         if (ret != INFERENCE_ENGINE_ERROR_NONE) {
692                                 LOGE("Fail to allocate tensor buffer.");
693                                 mInputTensorBuffers.release();
694                                 return ret;
695                         }
696
697                         mInputTensorBuffers.addTensorBuffer(layer.first, tensor_buffer);
698                 }
699         }
700
701         LOGI("Input tensor buffer count is %zu", mInputTensorBuffers.size());
702
703         // Get output tensor buffers from the backend engine if it allocated them.
704         auto &outputTensorBuffers = mOutputTensorBuffers.getIETensorBuffer();
705         ret = mBackend->GetOutputTensorBuffers(outputTensorBuffers);
706         if (ret != INFERENCE_ENGINE_ERROR_NONE) {
707                 LOGE("Fail to get output tensor buffers from backend engine.");
708                 return ConvertEngineErrorToVisionError(ret);
709         }
710
711         ret = mBackend->GetOutputLayerProperty(mOutputLayerProperty);
712         if (ret != INFERENCE_ENGINE_ERROR_NONE) {
713                 LOGE("Fail to get output layer property from backend engine.");
714                 return ConvertEngineErrorToVisionError(ret);
715         }
716
717         // If the backend engine isn't able to allocate output tensor buffers internally,
718         // then allocate the buffers here.
719         if (mOutputTensorBuffers.empty()) {
720                 for (auto &layer : mOutputLayerProperty.layers) {
721                         inference_engine_tensor_buffer tensor_buffer;
722
723                         ret = mOutputTensorBuffers.allocate(tensor_buffer, layer.second);
724                         if (ret != INFERENCE_ENGINE_ERROR_NONE) {
725                                 LOGE("Fail to allocate tensor buffer.");
726                                 mOutputTensorBuffers.release();
727                                 return ret;
728                         }
729
730                         mOutputTensorBuffers.addTensorBuffer(layer.first, tensor_buffer);
731                 }
732         }
733
734         LOGI("Output tensor buffer count is %zu", mOutputTensorBuffers.size());
735
736         return MEDIA_VISION_ERROR_NONE;
737 }
738
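// Quantized models produce UINT8 or UINT16 output tensors; this converts them to float
// in place so the post-processing paths below can treat every output uniformly.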
739 int Inference::ConvertOutputDataTypeToFloat()
740 {
741         IETensorBuffer &ieTensorBuffers = mOutputTensorBuffers.getIETensorBuffer();
742
743         for (auto &ieTensorBuffer : ieTensorBuffers) {
744                 auto &tensorBuffer = ieTensorBuffer.second;
745
746                 // Normalize output tensor data converting it to float type in case of quantized model.
747                 if (tensorBuffer.data_type == INFERENCE_TENSOR_DATA_TYPE_UINT8) {
748                         int ret = mOutputTensorBuffers.convertToFloat<unsigned char>(&tensorBuffer);
749                         if (ret != MEDIA_VISION_ERROR_NONE) {
750                                 LOGE("Fail to convert tensor data to float type.");
751                                 return ret;
752                         }
753                 }
754
755                 if (tensorBuffer.data_type == INFERENCE_TENSOR_DATA_TYPE_UINT16) {
756                         int ret = mOutputTensorBuffers.convertToFloat<unsigned short>(&tensorBuffer);
757                         if (ret != MEDIA_VISION_ERROR_NONE) {
758                                 LOGE("Fail to convert tensor data to float type.");
759                                 return ret;
760                         }
761                 }
762         }
763
764         return MEDIA_VISION_ERROR_NONE;
765 }
766
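// Binds an inference backend: validates the requested backend type, creates the
// InferenceEngineCommon object, loads the backend library and then checks that the
// requested target device is covered by the backend's reported capacity.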
767 int Inference::Bind(int backend_type, int device_type)
768 {
769         LOGI("ENTER");
770
771         int ret = CheckBackendType(static_cast<mv_inference_backend_type_e>(backend_type));
772         if (ret != MEDIA_VISION_ERROR_NONE)
773                 return ret;
774
775         std::string backendName = mSupportedInferenceBackend[backend_type].first;
776         LOGI("backend string name: %s", backendName.c_str());
777
778         inference_engine_config config = {
779                 .backend_name = backendName,
780                 .backend_type = backend_type,
781                 // By default, the target device is CPU. If the user defined a desired device type
782                 // in the json file, then the device type will be set by the Load callback.
783                 .target_devices = device_type,
784         };
785
786         // Create a backend class object.
787         try {
788                 mBackend = new InferenceEngineCommon();
789
790 #if ENABLE_INFERENCE_PROFILER
791                 mBackend->EnableProfiler(true);
792                 mBackend->DumpProfileToFile("profile_data_" + backendName + ".txt");
793 #endif
794         } catch (const std::bad_alloc &ex) {
795                 LOGE("Fail to create backend : %s", ex.what());
796                 return MEDIA_VISION_ERROR_OUT_OF_MEMORY;
797         }
798
799         ret = MEDIA_VISION_ERROR_NONE;
800
801         // Load configuration file if a given backend type is mlapi.
802         if (config.backend_type == MV_INFERENCE_BACKEND_MLAPI) {
803                 ret = mBackend->LoadConfigFile();
804                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
805                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
806                 }
807         }
808
809         // Bind a backend library.
810         ret = mBackend->BindBackend(&config);
811         if (ret != INFERENCE_ENGINE_ERROR_NONE) {
812                 LOGE("Fail to bind backend library.(%d)", ret);
813                 return MEDIA_VISION_ERROR_INVALID_OPERATION;
814         }
815
816         // Get capacity information from a backend.
817         ret = mBackend->GetBackendCapacity(&mBackendCapacity);
818         if (ret != MEDIA_VISION_ERROR_NONE) {
819                 mBackend->UnbindBackend();
820                 LOGE("Fail to get backend capacity.");
821                 return ret;
822         }
823
824         if (!IsTargetDeviceSupported(mConfig.mTargetTypes)) {
825                 mBackend->UnbindBackend();
826                 LOGE("Tried to configure invalid target types.");
827                 return MEDIA_VISION_ERROR_NOT_SUPPORTED;
828         }
829
830         LOGI("LEAVE");
831
832         return MEDIA_VISION_ERROR_NONE;
833 }
834
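// Loads the model: reads the optional label file, detects the model format from the
// weight file extension, hands the model file(s) to the backend and finally prepares
// the input/output tensor buffers so Run() can be called.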
835 int Inference::Load(void)
836 {
837         LOGI("ENTER");
838
839         std::string label_file = mConfig.mUserFilePath;
840         size_t userFileLength = label_file.length();
841         if (userFileLength > 0 && access(label_file.c_str(), F_OK)) {
842                 LOGE("Label file does not exist at [%s]", label_file.c_str());
843                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
844         }
845
846         int ret = (userFileLength > 0) ? SetUserFile(label_file) : MEDIA_VISION_ERROR_NONE;
847         if (ret != MEDIA_VISION_ERROR_NONE) {
848                 LOGE("Fail to load label file.");
849                 return ret;
850         }
851
852         // Check if model file is valid or not.
853         std::string ext_str = mConfig.mWeightFilePath.substr(mConfig.mWeightFilePath.find_last_of(".") + 1);
854         std::map<std::string, int>::iterator key = mModelFormats.find(ext_str);
855         if (key == mModelFormats.end()) {
856                 LOGE("Invalid model file format.(ext = %s)", ext_str.c_str());
857                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
858         }
859
860         LOGI("%s model file has been detected.", ext_str.c_str());
861
862         std::vector<std::string> models;
863
864         inference_model_format_e model_format = static_cast<inference_model_format_e>(key->second);
865
866         // Push model file information to the models vector according to the detected model format.
867         switch (model_format) {
868         case INFERENCE_MODEL_CAFFE:
869         case INFERENCE_MODEL_TF:
870         case INFERENCE_MODEL_DARKNET:
871         case INFERENCE_MODEL_DLDT:
872         case INFERENCE_MODEL_ONNX:
873         case INFERENCE_MODEL_VIVANTE:
874                 models.push_back(mConfig.mWeightFilePath);
875                 models.push_back(mConfig.mConfigFilePath);
876                 break;
877         case INFERENCE_MODEL_TFLITE:
878         case INFERENCE_MODEL_TORCH:
879         case INFERENCE_MODEL_NNTRAINER:
880         case INFERENCE_MODEL_SNPE:
881                 models.push_back(mConfig.mWeightFilePath);
882                 break;
883         default:
884                 break;
885         }
886
887         // Request model loading to backend engine.
888         ret = mBackend->Load(models, model_format);
889         if (ret != INFERENCE_ENGINE_ERROR_NONE) {
890                 LOGE("Fail to load model");
891                 mCanRun = false;
892                 std::vector<std::string>().swap(models);
893                 return ConvertEngineErrorToVisionError(ret);
894         }
895
896         std::vector<std::string>().swap(models);
897
898         // Prepare input and output tensor buffers.
899         ret = PrepareTenosrBuffers();
900         if (ret != INFERENCE_ENGINE_ERROR_NONE) {
901                 LOGE("Fail to prepare buffer");
902                 mCanRun = false;
903                 return ret;
904         }
905
906         mCanRun = true;
907
908         LOGI("LEAVE");
909
910         return ConvertEngineErrorToVisionError(ret);
911 }
912
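// Fills each input tensor buffer from the corresponding mv_source: layer info and
// normalization options come from parsed metadata when available, otherwise from the
// legacy mConfig values, and mPreProc performs the actual resize/convert/normalize.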
913 int Inference::Preprocess(std::vector<mv_source_h> &mv_sources, std::vector<cv::Mat> &cv_sources)
914 {
915         unsigned int src_idx = 0;
916
917         for (auto &buffer : mInputTensorBuffers.getIETensorBuffer()) {
918                 inference_engine_tensor_buffer &tensor_buffer = buffer.second;
919                 int data_type = ConvertToCv(tensor_buffer.data_type);
920                 LayerInfo layerInfo;
921                 Options opt;
922                 mv_colorspace_e colorspace = MEDIA_VISION_COLORSPACE_INVALID;
923
924                 int ret = mv_source_get_colorspace(mv_sources[src_idx], &colorspace);
925                 if (ret != MEDIA_VISION_ERROR_NONE) {
926                         LOGE("Fail to get color space.");
927                         return ret;
928                 }
929
930                 if (mMetadata.GetInputMeta().IsParsed()) {
931                         layerInfo = mMetadata.GetInputMeta().GetLayer().at(buffer.first);
932
933                         if (!mMetadata.GetInputMeta().GetOption().empty())
934                                 opt = mMetadata.GetInputMeta().GetOption().at(buffer.first);
935                 } else {
936                         // Note: with the legacy path there is no way to set model-specific dequantization parameters (zero point and scale).
937                         // TODO: find a proper way to do this.
938                         opt.normalization.use = true;
939                         opt.normalization.mean.assign(3, mConfig.mMeanValue);
940                         opt.normalization.std.assign(3, mConfig.mStdValue);
941
942                         layerInfo.name = buffer.first;
943                         layerInfo.dims.push_back(mConfig.mTensorInfo.dim);
944                         layerInfo.dims.push_back(mConfig.mTensorInfo.height);
945                         layerInfo.dims.push_back(mConfig.mTensorInfo.width);
946                         layerInfo.dims.push_back(mConfig.mTensorInfo.ch);
947
948                         // Note: with the legacy path there is no way to use a model-specific color space, only a fixed one.
949                         // TODO: find a proper way to do this.
950                         layerInfo.colorSpace = MEDIA_VISION_COLORSPACE_RGB888;
951                         layerInfo.dataType = mConfig.mDataType;
952                         // TODO: find a proper way to set the shape type. With the legacy path the shape type cannot be changed,
953                         //       even though a different shape type may be needed depending on the inference engine.
954                         layerInfo.shapeType = INFERENCE_TENSOR_SHAPE_NHWC;
955                 }
956
957                 // TODO: try-catch{} error handling
958                 ret = mPreProc.Run(cv_sources[src_idx++], colorspace, data_type, layerInfo, opt, tensor_buffer.buffer);
959                 if (ret != MEDIA_VISION_ERROR_NONE) {
960                         LOGE("Fail to run pre-process.");
961                         return ret;
962                 }
963         }
964
965         return MEDIA_VISION_ERROR_NONE;
966 }
967
968 int Inference::Run(std::vector<mv_source_h> &mvSources, std::vector<mv_rectangle_s> &rects)
969 {
970         int ret = INFERENCE_ENGINE_ERROR_NONE;
971
972         if (!mCanRun) {
973                 LOGE("Inference is not ready to run.");
974                 return MEDIA_VISION_ERROR_INVALID_OPERATION;
975         }
976
977         if (mvSources.empty()) {
978                 LOGE("mvSources is empty.");
979                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
980         }
981
982         // Only one input source per inference request is supported as of now.
983         if (mvSources.size() > 1) {
984                 LOGE("It allows only one mv source for the inference.");
985                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
986         }
987
988         if (!rects.empty() && rects.size() != mvSources.size()) {
989                 LOGE("mvSources.size() should be same as rects.size() if rects.empty() is false.");
990                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
991         }
992
993         if (mConfig.mTensorInfo.ch != 1 && mConfig.mTensorInfo.ch != 3) {
994                 LOGE("Channel not supported.");
995                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
996         }
997
998         std::vector<cv::Mat> cvSources;
999
1000         ret = ConvertToCvSource(mvSources, cvSources, rects);
1001         if (ret != MEDIA_VISION_ERROR_NONE) {
1002                 LOGE("Fail to convert mv source to cv source.");
1003                 return ret;
1004         }
1005
1006         // mSourceSize is the original input image's size.
1007         // TODO: consider multiple cv sources.
1008         mSourceSize = cvSources[0].size();
1009
1010         ret = Preprocess(mvSources, cvSources);
1011         if (ret != MEDIA_VISION_ERROR_NONE) {
1012                 LOGE("Fail to preprocess given input sources.");
1013                 return ret;
1014         }
1015
1016         ret = mBackend->Run(mInputTensorBuffers.getIETensorBuffer(), mOutputTensorBuffers.getIETensorBuffer());
1017         if (ret != INFERENCE_ENGINE_ERROR_NONE)
1018                 return ret;
1019
1020         return ConvertOutputDataTypeToFloat();
1021 }
1022
1023 int Inference::Run(std::vector<void *> &buffer_objs)
1024 {
1025         int ret = INFERENCE_ENGINE_ERROR_NONE;
1026
1027         if (!mCanRun) {
1028                 LOGE("Inference is not ready to run.");
1029                 return MEDIA_VISION_ERROR_INVALID_OPERATION;
1030         }
1031
1032         if (buffer_objs.empty()) {
1033                 LOGE("buffer_objs is empty.");
1034                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
1035         }
1036
1037         // Only one input source per inference request is supported as of now.
1038         if (buffer_objs.size() > 1) {
1039                 LOGE("It allows only one source for the inference.");
1040                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
1041         }
1042
1043         if (mInputTensorBuffers.getIETensorBuffer().size() != buffer_objs.size()) {
1044                 LOGE("Raw source count is invalid.");
1045                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
1046         }
1047
1048         unsigned int buffer_idx = 0;
1049
1050         for (auto &buffer : mInputTensorBuffers.getIETensorBuffer()) {
1051                 inference_engine_tensor_buffer &tensor_buffer = buffer.second;
1052                 inference_engine_tensor_buffer *buffer_obj =
1053                                 static_cast<inference_engine_tensor_buffer *>(buffer_objs[buffer_idx]);
1054
1055                 if (tensor_buffer.size != buffer_obj->size) {
1056                         LOGE("Raw buffer size is invalid.");
1057                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
1058                 }
1059
1060                 LOGI("A number of tensor bytes : %zu", buffer_obj->size);
1061
1062                 memcpy(tensor_buffer.buffer, buffer_obj->buffer, tensor_buffer.size);
1063                 buffer_idx++;
1064         }
1065
1066         ret = mBackend->Run(mInputTensorBuffers.getIETensorBuffer(), mOutputTensorBuffers.getIETensorBuffer());
1067         if (ret != INFERENCE_ENGINE_ERROR_NONE)
1068                 return ret;
1069
1070         return ConvertOutputDataTypeToFloat();
1071 }
1072
1073 int Inference::Run()
1074 {
1075         int ret = INFERENCE_ENGINE_ERROR_NONE;
1076
1077         if (!mCanRun) {
1078                 LOGE("Inference is not ready to run.");
1079                 return MEDIA_VISION_ERROR_INVALID_OPERATION;
1080         }
1081
1082         ret = mBackend->Run(mInputTensorBuffers.getIETensorBuffer(), mOutputTensorBuffers.getIETensorBuffer());
1083         if (ret != INFERENCE_ENGINE_ERROR_NONE)
1084                 return ret;
1085
1086         return ConvertOutputDataTypeToFloat();
1087 }
1088
1089 std::pair<std::string, bool> Inference::GetSupportedInferenceBackend(int backend)
1090 {
1091         return mSupportedInferenceBackend[backend];
1092 }
1093
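// Collects classification results: reads the score tensor, optionally dequantizes and
// applies a sigmoid as described by the output metadata, drops scores below the
// confidence threshold and keeps the top N classes.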
1094 int Inference::GetClassficationResults(ImageClassificationResults *results)
1095 {
1096         // Will contain top N results in ascending order.
1097         std::vector<std::pair<float, int> > topScore;
1098         auto threadHold = mConfig.mConfidenceThresHold;
1099         constexpr unsigned int default_top_number = 5;
1100         tensor_t outputTensorInfo;
1101
1102         // Get inference result and contain it to outputTensorInfo.
1103         int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
1104         if (ret != MEDIA_VISION_ERROR_NONE) {
1105                 LOGE("Fail to get output result.");
1106                 return ret;
1107         }
1108
1109         PostProcess postProc;
1110         unsigned int classes = outputTensorInfo.dimInfo[0][1];
1111         unsigned int top_number = default_top_number;
1112
1113         if (mMetadata.GetOutputMeta().IsParsed()) {
1114                 OutputMetadata outputMetadata = mMetadata.GetOutputMeta();
1115                 std::vector<int> indexes = outputMetadata.GetScoreDimInfo().GetValidIndexAll();
1116
1117                 if (indexes.size() != 1) {
1118                         LOGE("Invalid dim size. It should be 1");
1119                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
1120                 }
1121
1122                 if (!mOutputTensorBuffers.exist(outputMetadata.GetScoreName())) {
1123                         LOGE("output buffer is NULL");
1124                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
1125                 }
1126
1127                 top_number = outputMetadata.GetScoreTopNumber();
1128                 threadHold = outputMetadata.GetScoreThreshold();
1129
1130                 classes = mOutputLayerProperty.layers[outputMetadata.GetScoreName()].shape[indexes[0]];
1131         }
1132
1133         postProc.ScoreClear(top_number);
1134
1135         auto *prediction = reinterpret_cast<float *>(outputTensorInfo.data[0]);
1136
1137         LOGI("class count: %d", classes);
1138
1139         for (unsigned int idx = 0; idx < classes; ++idx) {
1140                 float value = prediction[idx];
1141
1142                 if (mMetadata.GetOutputMeta().IsParsed()) {
1143                         OutputMetadata outputMetadata = mMetadata.GetOutputMeta();
1144
1145                         if (outputMetadata.GetScoreDeQuant()) {
1146                                 value = PostProcess::dequant(value, outputMetadata.GetScoreDeQuantScale(),
1147                                                                                          outputMetadata.GetScoreDeQuantZeroPoint());
1148                         }
1149
1150                         if (outputMetadata.GetScoreType() == INFERENCE_SCORE_TYPE_SIGMOID)
1151                                 value = PostProcess::sigmoid(value);
1152                 }
1153
1154                 if (value < threadHold)
1155                         continue;
1156
1157                 postProc.ScorePush(value, idx);
1158         }
1159
1160         postProc.ScorePop(topScore);
1161         results->number_of_classes = 0;
1162
1163         for (auto &score : topScore) {
1164                 LOGI("score: %.3f, threshold: %.3f", score.first, threadHold);
1165                 LOGI("idx:%d", score.second);
1166                 LOGI("classProb: %.3f", score.first);
1167
1168                 results->indices.push_back(score.second);
1169                 results->confidences.push_back(score.first);
1170                 results->names.push_back(mUserListName[score.second]);
1171                 results->number_of_classes++;
1172         }
1173
1174         LOGI("Inference: GetClassificationResults: %d\n", results->number_of_classes);
1175         return MEDIA_VISION_ERROR_NONE;
1176 }
1177
1178 int Inference::GetObjectDetectionResults(ObjectDetectionResults *results)
1179 {
1180         if (mMetadata.GetOutputMeta().IsParsed()) {
1181                 OutputMetadata &outputMeta = mMetadata.GetOutputMeta();
1182
1183                 // decoding type
1184                 if (!mOutputTensorBuffers.exist(outputMeta.GetBoxName()) ||
1185                         !mOutputTensorBuffers.exist(outputMeta.GetScoreName())) {
1186                         LOGE("output buffers named %s or %s are NULL", outputMeta.GetBoxName().c_str(),
1187                                  outputMeta.GetScoreName().c_str());
1188                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
1189                 }
1190
1191                 std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
1192                 if (boxIndexes.size() != 1) {
1193                         LOGE("Invalid dim size. It should be 1");
1194                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
1195                 }
1196
1197                 int boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
1198                 int numberOfObjects = 0;
1199
1200                 if (outputMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
1201                         std::vector<int> scoreIndexes = outputMeta.GetScoreDimInfo().GetValidIndexAll();
1202                         if (scoreIndexes.size() != 1) {
1203                                 LOGE("Invalid dim size. It should be 1");
1204                                 return MEDIA_VISION_ERROR_INVALID_OPERATION;
1205                         }
1206                         numberOfObjects = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]];
1207                 }
1208
1209                 ObjectDecoder objDecoder(mOutputTensorBuffers, outputMeta, boxOffset,
1210                                                                  static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getWidth()),
1211                                                                  static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getHeight()),
1212                                                                  numberOfObjects);
1213
1214                 objDecoder.init();
1215                 objDecoder.decode();
1216                 results->number_of_objects = 0;
1217
1218                 auto &rLoc = results->locations;
1219
1220                 for (auto &box : objDecoder.getObjectAll()) {
1221                         results->indices.push_back(box.index);
1222                         results->names.push_back(mUserListName[box.index]);
1223                         results->confidences.push_back(box.score);
1224                         auto &bLoc = box.location;
1225
1226                         auto srcW = static_cast<double>(mSourceSize.width);
1227                         auto srcH = static_cast<double>(mSourceSize.height);
1228
1229                         auto halfW = (bLoc.x - bLoc.width * 0.5f);
1230                         auto halfH = (bLoc.y - bLoc.height * 0.5f);
1231
1232                         if (mMetadata.GetInputMeta().option.begin()->second.resizer == Resizer::LETTERBOX) {
1233                                 double dstW = static_cast<double>(mMetadata.GetInputMeta().layer.begin()->second.getWidth());
1234                                 double dstH = static_cast<double>(mMetadata.GetInputMeta().layer.begin()->second.getHeight());
1235                                 double scale = std::min(1.0, std::min(dstW / srcW, dstH / srcH));
1236                                 double padSize[] = { (dstW - (scale * srcW)) / 2.0, (dstH - (scale * srcH)) / 2.0 };
1237
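                                // The letterbox preprocessing scaled the source by scale = min(1, dstW/srcW, dstH/srcH)
                                // and centered it with padding, so a decoded (normalized) box is mapped back to source
                                // coordinates by multiplying by the tensor size, subtracting the padding and dividing
                                // by the scale. For example (illustrative numbers), a 1280x720 source fed into a
                                // 416x416 input gives scale = 416/1280 = 0.325 and a vertical pad of
                                // (416 - 0.325 * 720) / 2 = 91 pixels on each side.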
1238                                 auto rect =
1239                                                 cv::Rect(static_cast<int>(std::min(srcW, std::max((halfW * dstW - padSize[0]) / scale, 0.0))),
1240                                                                  static_cast<int>(std::min(srcH, std::max((halfH * dstH - padSize[1]) / scale, 0.0))),
1241                                                                  static_cast<int>((bLoc.width * dstW) / scale + padSize[0]),
1242                                                                  static_cast<int>((bLoc.height * dstH) / scale + padSize[1]));
1243
1244                                 rect.width = (rect.x + rect.width) > srcW ? srcW - rect.x : rect.width;
1245                                 rect.height = (rect.y + rect.height) > srcH ? srcH - rect.y : rect.height;
1246
1247                                 rLoc.push_back(rect);
1248                         } else {
1249                                 rLoc.push_back(cv::Rect(halfW * srcW, halfH * srcH, bLoc.width * srcW, bLoc.height * srcH));
1250                         }
1251                         results->number_of_objects++;
1252                 }
1253
1254                 LOGI("Inference: GetObjectDetectionResults: %d\n", results->number_of_objects);
1255         } else {
1256                 tensor_t outputTensorInfo;
1257
1258                 // Get inference result and contain it to outputTensorInfo.
1259                 int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
1260                 if (ret != MEDIA_VISION_ERROR_NONE) {
1261                         LOGE("Fail to get output result.");
1262                         return ret;
1263                 }
1264
1265                 // In case of object detection,
1266                 // some models apply post-processing internally while others do not.
1267                 // Thus, those cases should be handled separately.
1268
1269                 float *boxes = nullptr;
1270                 float *classes = nullptr;
1271                 float *scores = nullptr;
1272                 int number_of_detections = 0;
1273
1274                 if (outputTensorInfo.dimInfo.size() == 1) {
1275                         // There is no way to know how many objects were detected unless the model
1276                         // provides that count, so each backend has to supply the number of results itself.
1277                         // For example, with OpenCV, MobilenetV1-SSD doesn't provide it, so the number of
1278                         // objects is read from the 1st element, i.e., outputTensorInfo.data[0]. The output
1279                         // shape is 1x1xNx7 and the 1st of the 7 values holds the image id, which is useless
1280                         // when batch mode isn't supported, so it is reused here as the detection count.
1281
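                        // Each of the N rows therefore carries 7 floats in the order
                        // [image_id, class_id, score, left, top, right, bottom], which is why
                        // columns 1..6 are split out below.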
1282                         number_of_detections = static_cast<int>(*reinterpret_cast<float *>(outputTensorInfo.data[0]));
1283                         cv::Mat cvOutputData(number_of_detections, outputTensorInfo.dimInfo[0][3], CV_32F,
1284                                                                  outputTensorInfo.data[0]);
1285
1286                         // boxes
1287                         cv::Mat cvLeft = cvOutputData.col(3).clone();
1288                         cv::Mat cvTop = cvOutputData.col(4).clone();
1289                         cv::Mat cvRight = cvOutputData.col(5).clone();
1290                         cv::Mat cvBottom = cvOutputData.col(6).clone();
1291                         cv::Mat cvScores, cvClasses, cvBoxes;
1292                         cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
1293
1294                         cv::hconcat(cvBoxElems, 4, cvBoxes);
1295
1296                         // classes
1297                         cvClasses = cvOutputData.col(1).clone();
1298
1299                         // scores
1300                         cvScores = cvOutputData.col(2).clone();
1301
1302                         boxes = cvBoxes.ptr<float>(0);
1303                         classes = cvClasses.ptr<float>(0);
1304                         scores = cvScores.ptr<float>(0);
1305                 } else {
1306                         boxes = reinterpret_cast<float *>(outputTensorInfo.data[0]);
1307                         classes = reinterpret_cast<float *>(outputTensorInfo.data[1]);
1308                         scores = reinterpret_cast<float *>(outputTensorInfo.data[2]);
1309                         number_of_detections = (int) (*reinterpret_cast<float *>(outputTensorInfo.data[3]));
1310                 }
1311
1312                 LOGI("number_of_detections = %d", number_of_detections);
1313
1314                 results->number_of_objects = 0;
1315
1316                 for (int idx = 0; idx < number_of_detections; ++idx) {
1317                         if (scores[idx] < mConfig.mConfidenceThresHold)
1318                                 continue;
1319
1320                         int left = static_cast<int>(boxes[idx * 4 + 1] * mSourceSize.width);
1321                         int top = static_cast<int>(boxes[idx * 4 + 0] * mSourceSize.height);
1322                         int right = static_cast<int>(boxes[idx * 4 + 3] * mSourceSize.width);
1323                         int bottom = static_cast<int>(boxes[idx * 4 + 2] * mSourceSize.height);
1324                         cv::Rect loc;
1325
1326                         loc.x = left;
1327                         loc.y = top;
1328                         loc.width = right - left + 1;
1329                         loc.height = bottom - top + 1;
1330
1331                         results->indices.push_back(static_cast<int>(classes[idx]));
1332                         results->confidences.push_back(scores[idx]);
1333                         results->names.push_back(mUserListName[static_cast<int>(classes[idx])]);
1334                         results->locations.push_back(loc);
1335                         results->number_of_objects++;
1336
1337                         LOGI("objectClass: %d", static_cast<int>(classes[idx]));
1338                         LOGI("confidence:%f", scores[idx]);
1339                         LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);
1340                 }
1341
1342                 LOGI("Inference: GetObjectDetectionResults: %d\n", results->number_of_objects);
1343         }
1344
1345         return MEDIA_VISION_ERROR_NONE;
1346 }
1347
1348 int Inference::GetFaceDetectionResults(FaceDetectionResults *results)
1349 {
1350         if (mMetadata.GetOutputMeta().IsParsed()) {
1351                 OutputMetadata &outputMeta = mMetadata.GetOutputMeta();
1352
1353                 // Check that the output buffers required for box decoding exist.
1354                 if (!mOutputTensorBuffers.exist(outputMeta.GetBoxName()) ||
1355                         !mOutputTensorBuffers.exist(outputMeta.GetScoreName())) {
1356                         LOGE("output buffers named %s or %s are NULL", outputMeta.GetBoxName().c_str(),
1357                                  outputMeta.GetScoreName().c_str());
1358                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
1359                 }
1360
1361                 std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
1362                 if (boxIndexes.size() != 1) {
1363                         LOGE("Invalid dim size. It should be 1");
1364                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
1365                 }
1366
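                     // Size of the box tensor along its single valid dimension, passed to
                     // ObjectDecoder as the per-box offset.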
1367                 int boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
1368                 int numberOfFaces = 0;
1369
1370                 if (outputMeta.GetBoxDecodingType() != INFERENCE_BOX_DECODING_TYPE_BYPASS) {
1371                         std::vector<int> scoreIndexes = outputMeta.GetScoreDimInfo().GetValidIndexAll();
1372                         if (scoreIndexes.size() != 1) {
1373                                 LOGE("Invalid dim size. It should be 1");
1374                                 return MEDIA_VISION_ERROR_INVALID_OPERATION;
1375                         }
1376                         numberOfFaces = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]];
1377                 }
1378
1379                 ObjectDecoder objDecoder(mOutputTensorBuffers, outputMeta, boxOffset,
1380                                                                  static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getWidth()),
1381                                                                  static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getHeight()),
1382                                                                  numberOfFaces);
1383
1384                 objDecoder.init();
1385                 objDecoder.decode();
1386                 results->number_of_faces = 0;
1387
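                     // Decoded face boxes are normalized and center-based, so convert each
                     // center/size pair to a top-left rectangle in source image coordinates.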
1388                 for (auto &face : objDecoder.getObjectAll()) {
1389                         results->confidences.push_back(face.score);
1390                         results->locations.push_back(
1391                                         cv::Rect(static_cast<int>((face.location.x - face.location.width * 0.5f) *
1392                                                                                           static_cast<float>(mSourceSize.width)),
1393                                                          static_cast<int>((face.location.y - face.location.height * 0.5f) *
1394                                                                                           static_cast<float>(mSourceSize.height)),
1395                                                          static_cast<int>(face.location.width * static_cast<float>(mSourceSize.width)),
1396                                                          static_cast<int>(face.location.height * static_cast<float>(mSourceSize.height))));
1397                         results->number_of_faces++;
1398                 }
1399
1400                 LOGI("Inference: GetFaceDetectionResults: %d\n", results->number_of_faces);
1401         } else {
1402                 tensor_t outputTensorInfo;
1403
1404                 // Get the inference result and store it in outputTensorInfo.
1405                 int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
1406                 if (ret != MEDIA_VISION_ERROR_NONE) {
1407                         LOGE("Fail to get output result.");
1408                         return ret;
1409                 }
1410
1411                 // In case of object detection,
1412                 // some models apply post-processing while others do not,
1413                 // so those cases should be handled separately.
1414
1415                 float *boxes = nullptr;
1416                 float *classes = nullptr;
1417                 float *scores = nullptr;
1418                 int number_of_detections = 0;
1419                 cv::Mat cvScores, cvClasses, cvBoxes;
1420
1421                 if (outputTensorInfo.dimInfo.size() == 1) {
1422                         // There is no way to know how many objects were detected unless the number of objects is
1423                         // provided, so in this case each backend has to report the number of results itself.
1424                         // For example, with the OpenCV backend, MobilenetV1-SSD doesn't report it, so the number of
1425                         // objects is written to the 1st element, i.e., outputTensorInfo.data[0]. (The shape is 1x1xNx7;
1426                         // the 1st of the 7 values is the image id, which is useless when batch mode isn't supported,
1427                         // so that slot carries the detection count instead.)
1428
1429                         number_of_detections = static_cast<int>(*reinterpret_cast<float *>(outputTensorInfo.data[0]));
1430                         cv::Mat cvOutputData(number_of_detections, outputTensorInfo.dimInfo[0][3], CV_32F,
1431                                                                  outputTensorInfo.data[0]);
1432
1433                         // boxes
1434                         cv::Mat cvLeft = cvOutputData.col(3).clone();
1435                         cv::Mat cvTop = cvOutputData.col(4).clone();
1436                         cv::Mat cvRight = cvOutputData.col(5).clone();
1437                         cv::Mat cvBottom = cvOutputData.col(6).clone();
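                             // Keep the same [top, left, bottom, right] order as the decoding below.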
1438                         cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
1439                         cv::hconcat(cvBoxElems, 4, cvBoxes);
1440
1441                         // classes
1442                         cvClasses = cvOutputData.col(1).clone();
1443
1444                         // scores
1445                         cvScores = cvOutputData.col(2).clone();
1446
1447                         boxes = cvBoxes.ptr<float>(0);
1448                         classes = cvClasses.ptr<float>(0);
1449                         scores = cvScores.ptr<float>(0);
1450                 } else {
1451                         boxes = reinterpret_cast<float *>(outputTensorInfo.data[0]);
1452                         classes = reinterpret_cast<float *>(outputTensorInfo.data[1]);
1453                         scores = reinterpret_cast<float *>(outputTensorInfo.data[2]);
1454                         number_of_detections = static_cast<int>(*reinterpret_cast<float *>(outputTensorInfo.data[3]));
1455                 }
1456
1457                 results->number_of_faces = 0;
1458
1459                 for (int idx = 0; idx < number_of_detections; ++idx) {
1460                         if (scores[idx] < mConfig.mConfidenceThresHold)
1461                                 continue;
1462
1463                         int left = static_cast<int>(boxes[idx * 4 + 1] * mSourceSize.width);
1464                         int top = static_cast<int>(boxes[idx * 4 + 0] * mSourceSize.height);
1465                         int right = static_cast<int>(boxes[idx * 4 + 3] * mSourceSize.width);
1466                         int bottom = static_cast<int>(boxes[idx * 4 + 2] * mSourceSize.height);
1467                         cv::Rect loc;
1468
1469                         loc.x = left;
1470                         loc.y = top;
1471                         loc.width = right - left + 1;
1472                         loc.height = bottom - top + 1;
1473                         results->confidences.push_back(scores[idx]);
1474                         results->locations.push_back(loc);
1475                         results->number_of_faces++;
1476
1477                         LOGI("confidence:%f", scores[idx]);
1478                         LOGI("class: %f", classes[idx]);
1479                         LOGI("left:%f, top:%f, right:%f, bottom:%f", boxes[idx * 4 + 1], boxes[idx * 4 + 0], boxes[idx * 4 + 3],
1480                                  boxes[idx * 4 + 2]);
1481                         LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);
1482                 }
1483
1484                 LOGI("Inference: GetFaceDetectionResults: %d\n", results->number_of_faces);
1485         }
1486
1487         return MEDIA_VISION_ERROR_NONE;
1488 }
1489
1490 int Inference::GetFacialLandMarkDetectionResults(FacialLandMarkDetectionResults *results)
1491 {
1492         LOGI("ENTER");
1493
1494         if (mMetadata.GetOutputMeta().IsParsed()) {
1495                 OutputMetadata &outputMeta = mMetadata.GetOutputMeta();
1496
1497                 if (!mOutputTensorBuffers.exist(outputMeta.GetLandmarkName()) ||
1498                         !mOutputTensorBuffers.exist(outputMeta.GetScoreName())) {
1499                         LOGE("output buffers named %s or %s are NULL", outputMeta.GetLandmarkName().c_str(),
1500                                  outputMeta.GetScoreName().c_str());
1501                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
1502                 }
1503
1504                 int heatMapWidth = 0;
1505                 int heatMapHeight = 0;
1506                 int heatMapChannel = 0;
1507                 std::vector<int> channelIndexes = outputMeta.GetLandmarkDimInfo().GetValidIndexAll();
1508                 int number_of_landmarks = heatMapChannel;
1509
1510                 if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
1511                         LOGI("landmark dim size: %zu and idx[0] is %d", channelIndexes.size(), channelIndexes[0]);
1512                         number_of_landmarks = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]] /
1513                                                                   outputMeta.GetLandmarkOffset();
1514                 } else if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) {
1515                         number_of_landmarks = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]];
1516                 } else {
1517                         heatMapWidth = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
1518                                                                    .shape[outputMeta.GetLandmarkHeatMapInfo().wIdx];
1519                         heatMapHeight = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
1520                                                                         .shape[outputMeta.GetLandmarkHeatMapInfo().hIdx];
1521                         heatMapChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
1522                                                                          .shape[outputMeta.GetLandmarkHeatMapInfo().cIdx];
                             // One heat map channel per landmark in the heat-map decoding case.
                             number_of_landmarks = heatMapChannel;
1523                 }
1524
1525                 LOGI("heatMap: w[%d], h[%d], c[%d]", heatMapWidth, heatMapHeight, heatMapChannel);
1526
1527                 // decoding
1528                 PoseDecoder poseDecoder(mOutputTensorBuffers, outputMeta, heatMapWidth, heatMapHeight, heatMapChannel,
1529                                                                 number_of_landmarks);
1530
1531                 // Initialize the decoder queue with the landmarks to be decoded.
1532                 int ret = poseDecoder.init();
1533                 if (ret != MEDIA_VISION_ERROR_NONE) {
1534                         LOGE("Fail to init poseDecoder");
1535                         return ret;
1536                 }
1537
1538                 float inputW = 1.f;
1539                 float inputH = 1.f;
1540
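                     // The decoder is given the input tensor size when the model reports landmark
                     // coordinates in pixels; for normalized coordinates a scale of 1.f is kept.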
1541                 if (outputMeta.GetLandmarkCoordinate() == INFERENCE_LANDMARK_COORDINATE_TYPE_PIXEL) {
1542                         inputW = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getWidth());
1543                         inputH = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getHeight());
1544                 }
1545
1546                 float thresRadius = outputMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ?
1547                                                                         0.0 :
1548                                                                         outputMeta.GetLandmarkHeatMapInfo().nmsRadius;
1549
1550                 poseDecoder.decode(inputW, inputH, thresRadius);
1551
1552                 for (int landmarkIndex = 0; landmarkIndex < number_of_landmarks; landmarkIndex++) {
1553                         results->locations.push_back(
1554                                         cv::Point(poseDecoder.getPointX(0, landmarkIndex) * static_cast<float>(mSourceSize.width),
1555                                                           poseDecoder.getPointY(0, landmarkIndex) * static_cast<float>(mSourceSize.height)));
1556                 }
1557
1558                 results->number_of_landmarks = results->locations.size();
1559         } else {
1560                 tensor_t outputTensorInfo;
1561
1562                 // Get the inference result and store it in outputTensorInfo.
1563                 int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
1564                 if (ret != MEDIA_VISION_ERROR_NONE) {
1565                         LOGE("Fail to get output result.");
1566                         return ret;
1567                 }
1568
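                     // The output is a flat array of (x, y) pairs, so the landmark count is half
                     // of the tensor's second dimension.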
1569                 int number_of_detections = outputTensorInfo.dimInfo[0][1] >> 1;
1570
1571                 results->number_of_landmarks = number_of_detections;
1572                 results->locations.resize(number_of_detections);
1573
1574                 LOGI("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height);
1575
1576                 float *loc = reinterpret_cast<float *>(outputTensorInfo.data[0]);
1577
1578                 for (auto &point : results->locations) {
1579                         point.x = static_cast<int>(*loc++ * mSourceSize.width);
1580                         point.y = static_cast<int>(*loc++ * mSourceSize.height);
1581
1582                         LOGI("x:%d, y:%d", point.x, point.y);
1583                 }
1584         }
1585
1586         LOGI("Inference: FacialLandmarkDetectionResults: %d\n", results->number_of_landmarks);
1587         return MEDIA_VISION_ERROR_NONE;
1588 }
1589
1590 int Inference::GetPoseLandmarkDetectionResults(std::unique_ptr<mv_inference_pose_s> &detectionResults, int width,
1591                                                                                            int height)
1592 {
1593         LOGI("ENTER");
1594
1595         auto poseResult = std::make_unique<mv_inference_pose_s>();
1596
1597         if (mMetadata.GetOutputMeta().IsParsed()) {
1598                 OutputMetadata &outputMeta = mMetadata.GetOutputMeta();
1599
1600                 if (!mOutputTensorBuffers.exist(outputMeta.GetLandmarkName()) ||
1601                         !mOutputTensorBuffers.exist(outputMeta.GetScoreName())) {
1602                         LOGE("output buffers named %s or %s are NULL", outputMeta.GetLandmarkName().c_str(),
1603                                  outputMeta.GetScoreName().c_str());
1604                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
1605                 }
1606
1607                 int heatMapWidth = 0;
1608                 int heatMapHeight = 0;
1609                 int heatMapChannel = 0;
1610
1611                 if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP ||
1612                         outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) {
1613                         heatMapWidth = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
1614                                                                    .shape[outputMeta.GetLandmarkHeatMapInfo().wIdx];
1615                         heatMapHeight = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
1616                                                                         .shape[outputMeta.GetLandmarkHeatMapInfo().hIdx];
1617                         heatMapChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
1618                                                                          .shape[outputMeta.GetLandmarkHeatMapInfo().cIdx];
1619                 }
1620
1621                 LOGI("heatMap: w[%d], h[%d], c[%d]", heatMapWidth, heatMapHeight, heatMapChannel);
1622
1623                 std::vector<int> channelIndexes = outputMeta.GetLandmarkDimInfo().GetValidIndexAll();
1624
1625                 // For the bypass decoding types, landmarkChannel is derived from the shape of the
1626                 // landmark output tensor; otherwise it falls back to heatMapChannel, which is the
1627                 // default.
1628                 int landmarkChannel = heatMapChannel;
1629
1630                 if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS)
1631                         landmarkChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]] /
1632                                                           outputMeta.GetLandmarkOffset();
1633                 else if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL)
1634                         landmarkChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]];
1635
1636                 poseResult->number_of_landmarks_per_pose = mUserListName.empty() ? landmarkChannel :
1637                                                                                                                                                    static_cast<int>(mUserListName.size());
1638
1639                 LOGE("number of landmarks per pose: %d", poseResult->number_of_landmarks_per_pose);
1640
1641                 if (poseResult->number_of_landmarks_per_pose >= MAX_NUMBER_OF_LANDMARKS_PER_POSE) {
1642                         LOGE("Exceeded maximum number of landmarks per pose(%d >= %d).", poseResult->number_of_landmarks_per_pose,
1643                                  MAX_NUMBER_OF_LANDMARKS_PER_POSE);
1644                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
1645                 }
1646
1647                 // decoding
1648                 PoseDecoder poseDecoder(mOutputTensorBuffers, outputMeta, heatMapWidth, heatMapHeight, heatMapChannel,
1649                                                                 poseResult->number_of_landmarks_per_pose);
1650
1651                 // Initialize the decoder queue with the landmarks to be decoded.
1652                 int ret = poseDecoder.init();
1653                 if (ret != MEDIA_VISION_ERROR_NONE) {
1654                         LOGE("Fail to init poseDecoder");
1655                         return ret;
1656                 }
1657
1658                 float inputW = 1.f;
1659                 float inputH = 1.f;
1660                 float thresRadius = outputMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ?
1661                                                                         0.0 :
1662                                                                         outputMeta.GetLandmarkHeatMapInfo().nmsRadius;
1663                 if (outputMeta.GetLandmarkCoordinate() == INFERENCE_LANDMARK_COORDINATE_TYPE_PIXEL) {
1664                         inputW = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getWidth());
1665                         inputH = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getHeight());
1666                 }
1667
1668                 poseDecoder.decode(inputW, inputH, thresRadius);
1669                 poseResult->number_of_poses = poseDecoder.getNumberOfPose();
1670
1671                 for (int poseIndex = 0; poseIndex < poseResult->number_of_poses; ++poseIndex) {
1672                         for (int landmarkIndex = 0; landmarkIndex < poseResult->number_of_landmarks_per_pose; ++landmarkIndex) {
1673                                 int part = landmarkIndex;
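                                     // An optional user file remaps landmark indices to model output
                                     // channels (1-based); non-positive entries are skipped.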
1674                                 if (!mUserListName.empty()) {
1675                                         part = std::stoi(mUserListName[landmarkIndex]) - 1;
1676                                         if (part < 0) {
1677                                                 continue;
1678                                         }
1679                                 }
1680
1681                                 poseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
1682                                 poseResult->landmarks[poseIndex][landmarkIndex].point.x =
1683                                                 poseDecoder.getPointX(poseIndex, part) * static_cast<float>(mSourceSize.width);
1684                                 poseResult->landmarks[poseIndex][landmarkIndex].point.y =
1685                                                 poseDecoder.getPointY(poseIndex, part) * static_cast<float>(mSourceSize.height);
1686                                 poseResult->landmarks[poseIndex][landmarkIndex].label = landmarkIndex;
1687                                 poseResult->landmarks[poseIndex][landmarkIndex].score = poseDecoder.getScore(poseIndex, part);
1688                         }
1689                 }
1690
1691                 detectionResults = std::move(poseResult);
1692         } else {
1693                 tensor_t outputTensorInfo;
1694
1695                 // Get the inference result and store it in outputTensorInfo.
1696                 int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
1697                 if (ret != MEDIA_VISION_ERROR_NONE) {
1698                         LOGE("Fail to get output result.");
1699                         return ret;
1700                 }
1701
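                     // View the 1 x H x W x C output tensor as an H x W image with C channels,
                     // one heat map per landmark, and split it into per-landmark planes.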
1702                 cv::Mat reShapeTest(cv::Size(outputTensorInfo.dimInfo[0][2], outputTensorInfo.dimInfo[0][1]),
1703                                                         CV_32FC(outputTensorInfo.dimInfo[0][3]), outputTensorInfo.data[0]);
1704                 std::vector<cv::Mat> multiChannels(outputTensorInfo.dimInfo[0][3]);
1705
1706                 split(reShapeTest, multiChannels);
1707
1708                 float ratioX = static_cast<float>(outputTensorInfo.dimInfo[0][2]);
1709                 float ratioY = static_cast<float>(outputTensorInfo.dimInfo[0][1]);
1710
1711                 poseResult->number_of_poses = 1;
1712                 poseResult->number_of_landmarks_per_pose = outputTensorInfo.dimInfo[0][3];
1713
1714                 if (poseResult->number_of_landmarks_per_pose >= MAX_NUMBER_OF_LANDMARKS_PER_POSE) {
1715                         LOGE("Exceeded maximum number of landmarks per pose(%d >= %d).", poseResult->number_of_landmarks_per_pose,
1716                                  MAX_NUMBER_OF_LANDMARKS_PER_POSE);
1717                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
1718                 }
1719
1720                 for (int poseIndex = 0; poseIndex < poseResult->number_of_poses; ++poseIndex) {
1721                         for (int landmarkIndex = 0; landmarkIndex < poseResult->number_of_landmarks_per_pose; landmarkIndex++) {
1722                                 int part = landmarkIndex;
1723                                 if (!mUserListName.empty()) {
1724                                         part = std::stoi(mUserListName[landmarkIndex]) - 1;
1725                                         if (part < 0) {
1726                                                 continue;
1727                                         }
1728                                 }
1729
1730                                 cv::Mat heatMap = multiChannels[part];
1731                                 double score;
1732                                 cv::Point loc;
1733                                 cv::Point2f loc2f;
1734                                 cv::Mat blurredHeatMap;
1735
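                                     // Blur the heat map to suppress noise, then take the peak location
                                     // as the landmark position.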
1736                                 cv::GaussianBlur(heatMap, blurredHeatMap, cv::Size(), 5.0, 5.0);
1737                                 cv::minMaxLoc(blurredHeatMap, NULL, &score, NULL, &loc);
1738
1739                                 loc2f.x = (static_cast<float>(loc.x) / ratioX);
1740                                 loc2f.y = (static_cast<float>(loc.y) / ratioY);
1741
1742                                 LOGI("landmarkIndex[%2d] - mapping to [%2d]: x[%.3f], y[%.3f], score[%.3f]", landmarkIndex, part,
1743                                          loc2f.x, loc2f.y, score);
1744
1745                                 poseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
1746                                 poseResult->landmarks[poseIndex][landmarkIndex].point.x =
1747                                                 static_cast<int>(static_cast<float>(width) * loc2f.x);
1748                                 poseResult->landmarks[poseIndex][landmarkIndex].point.y =
1749                                                 static_cast<int>(static_cast<float>(height) * loc2f.y);
1750                                 poseResult->landmarks[poseIndex][landmarkIndex].score = score;
1751                                 poseResult->landmarks[poseIndex][landmarkIndex].label = -1;
1752                         }
1753                 }
1754
1755                 detectionResults = std::move(poseResult);
1756         }
1757
1758         return MEDIA_VISION_ERROR_NONE;
1759 }
1760
1761 } /* namespace inference */
1762 } /* namespace mediavision */