 * Copyright (c) 2019 Samsung Electronics Co., Ltd All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
#include "mv_private.h"
#include "Inference.h"
#include "InferenceIni.h"
#include "ObjectDecoder.h"
#include "PoseDecoder.h"

#define MV_INFERENCE_OUTPUT_NUMBERS_MAX 10
#define MV_INFERENCE_OUTPUT_NUMBERS_MIN 1
#define MV_INFERENCE_CONFIDENCE_THRESHOLD_MAX 1.0
#define MV_INFERENCE_CONFIDENCE_THRESHOLD_MIN 0.0

using namespace mediavision::common::util;
using namespace mediavision::machine_learning;
InferenceConfig::InferenceConfig()
		, mDataType(MV_INFERENCE_DATA_FLOAT32)
		, mTargetTypes(MV_INFERENCE_TARGET_DEVICE_CPU)
		, mConfidenceThresHold()
		, mMaxOutputNumbers(1)
	mTensorInfo.width = -1;
	mTensorInfo.height = -1;
Inference::Inference()
	CheckSupportedInferenceBackend();

	for (auto &backend : mSupportedInferenceBackend) {
		LOGI("%s: %s", backend.second.first.c_str(), backend.second.second ? "TRUE" : "FALSE");
Inference::~Inference()
	CleanupTensorBuffers();

	if (!mInputLayerProperty.layers.empty()) {
		mInputLayerProperty.layers.clear();
		std::map<std::string, inference_engine_tensor_info>().swap(mInputLayerProperty.layers);

	if (!mOutputLayerProperty.layers.empty()) {
		mOutputLayerProperty.layers.clear();
		std::map<std::string, inference_engine_tensor_info>().swap(mOutputLayerProperty.layers);

	mModelFormats.clear();

	// Release backend engine.
	mBackend->UnbindBackend();

	LOGI("Released backend engine.");
void Inference::CheckSupportedInferenceBackend()
	std::vector<int> supportedBackend = ini.GetSupportedInferenceEngines();

	for (auto &backend : supportedBackend) {
		LOGI("engine: %d", backend);
		mSupportedInferenceBackend[backend].second = true;
int Inference::ConvertEngineErrorToVisionError(int error)
	int ret = MEDIA_VISION_ERROR_NONE;

	case INFERENCE_ENGINE_ERROR_NONE:
		ret = MEDIA_VISION_ERROR_NONE;
	case INFERENCE_ENGINE_ERROR_NOT_SUPPORTED:
		ret = MEDIA_VISION_ERROR_NOT_SUPPORTED;
	case INFERENCE_ENGINE_ERROR_MSG_TOO_LONG:
		ret = MEDIA_VISION_ERROR_MSG_TOO_LONG;
	case INFERENCE_ENGINE_ERROR_NO_DATA:
		ret = MEDIA_VISION_ERROR_NO_DATA;
	case INFERENCE_ENGINE_ERROR_KEY_NOT_AVAILABLE:
		ret = MEDIA_VISION_ERROR_KEY_NOT_AVAILABLE;
	case INFERENCE_ENGINE_ERROR_OUT_OF_MEMORY:
		ret = MEDIA_VISION_ERROR_OUT_OF_MEMORY;
	case INFERENCE_ENGINE_ERROR_INVALID_PARAMETER:
		ret = MEDIA_VISION_ERROR_INVALID_PARAMETER;
	case INFERENCE_ENGINE_ERROR_INVALID_OPERATION:
		ret = MEDIA_VISION_ERROR_INVALID_OPERATION;
	case INFERENCE_ENGINE_ERROR_PERMISSION_DENIED:
		ret = MEDIA_VISION_ERROR_PERMISSION_DENIED;
	case INFERENCE_ENGINE_ERROR_NOT_SUPPORTED_FORMAT:
		ret = MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
	case INFERENCE_ENGINE_ERROR_INTERNAL:
		ret = MEDIA_VISION_ERROR_INTERNAL;
	case INFERENCE_ENGINE_ERROR_INVALID_DATA:
		ret = MEDIA_VISION_ERROR_INVALID_DATA;
	case INFERENCE_ENGINE_ERROR_INVALID_PATH:
		ret = MEDIA_VISION_ERROR_INVALID_PATH;

		LOGE("Unknown inference engine error type");
int Inference::ConvertTargetTypes(int given_types)
	int target_types = INFERENCE_TARGET_NONE;

	if (given_types & MV_INFERENCE_TARGET_DEVICE_CPU)
		target_types |= INFERENCE_TARGET_CPU;
	if (given_types & MV_INFERENCE_TARGET_DEVICE_GPU)
		target_types |= INFERENCE_TARGET_GPU;
	if (given_types & MV_INFERENCE_TARGET_DEVICE_CUSTOM)
		target_types |= INFERENCE_TARGET_CUSTOM;
int Inference::ConvertToCv(int given_type)
	const int ch = mConfig.mTensorInfo.ch;

	switch (given_type) {
	case INFERENCE_TENSOR_DATA_TYPE_UINT8:
		LOGI("Type is %d ch with UINT8", ch);
		type = ch == 1 ? CV_8UC1 : CV_8UC3;
	case INFERENCE_TENSOR_DATA_TYPE_FLOAT32:
		LOGI("Type is %d ch with FLOAT32", ch);
		type = ch == 1 ? CV_32FC1 : CV_32FC3;
		LOGI("Unknown data type, so FLOAT32 will be used by default");
		type = ch == 1 ? CV_32FC1 : CV_32FC3;
inference_tensor_data_type_e Inference::ConvertToIE(int given_type)
	inference_tensor_data_type_e type = INFERENCE_TENSOR_DATA_TYPE_FLOAT32;

	switch (given_type) {
	case MV_INFERENCE_DATA_FLOAT32:
		type = INFERENCE_TENSOR_DATA_TYPE_FLOAT32;
	case MV_INFERENCE_DATA_UINT8:
		type = INFERENCE_TENSOR_DATA_TYPE_UINT8;
		LOGI("Unknown data type, so FLOAT32 will be used by default");
int Inference::SetUserFile(std::string filename)
	std::ifstream fp(filename.c_str());
		return MEDIA_VISION_ERROR_INVALID_PATH;
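	// Read the label file line by line; one label per line, skipping empty lines.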
	std::string userListName;

		std::getline(fp, userListName);
		if (userListName.length())
			mUserListName.push_back(userListName);

	return MEDIA_VISION_ERROR_NONE;
void Inference::ConfigureModelFiles(const std::string modelConfigFilePath, const std::string modelWeightFilePath,
									const std::string modelUserFilePath)
	mConfig.mConfigFilePath = modelConfigFilePath;
	mConfig.mWeightFilePath = modelWeightFilePath;
	mConfig.mUserFilePath = modelUserFilePath;
int Inference::ConfigureInputInfo(int width, int height, int dim, int ch, double stdValue, double meanValue,
								  int dataType, const std::vector<std::string> names)
	// FIXME: mConfig should be removed
	mConfig.mTensorInfo = { width, height, dim, ch };
	mConfig.mStdValue = stdValue;
	mConfig.mMeanValue = meanValue;
	mConfig.mDataType = static_cast<mv_inference_data_type_e>(dataType);
	mConfig.mInputLayerNames = names;

	int ret = setInputInfo();
int Inference::configureInputMetaInfo()
	LOGI("use input meta");

	auto &layerInfo = mMetadata.GetInputMeta().GetLayer().begin()->second;

	if (layerInfo.shapeType == INFERENCE_TENSOR_SHAPE_NCHW) { // NCHW
		mConfig.mTensorInfo.ch = layerInfo.dims[1];
		mConfig.mTensorInfo.dim = layerInfo.dims[0];
		mConfig.mTensorInfo.width = layerInfo.dims[3];
		mConfig.mTensorInfo.height = layerInfo.dims[2];
	} else if (layerInfo.shapeType == INFERENCE_TENSOR_SHAPE_NHWC) { // NHWC
		mConfig.mTensorInfo.ch = layerInfo.dims[3];
		mConfig.mTensorInfo.dim = layerInfo.dims[0];
		mConfig.mTensorInfo.width = layerInfo.dims[2];
		mConfig.mTensorInfo.height = layerInfo.dims[1];
		LOGE("Invalid shape type[%d]", layerInfo.shapeType);

	if (!mMetadata.GetInputMeta().GetOption().empty()) {
		auto &option = mMetadata.GetInputMeta().GetOption().begin()->second;
		if (option.normalization.use) {
			mConfig.mMeanValue = option.normalization.mean[0];
			mConfig.mStdValue = option.normalization.std[0];

	mConfig.mDataType = layerInfo.dataType;
	mConfig.mInputLayerNames.clear();

	for (auto &layer : mMetadata.GetInputMeta().GetLayer())
		mConfig.mInputLayerNames.push_back(layer.first);

	int ret = setInputInfo();
int Inference::configureInputMetaInfo(MetaMap &inputMetaInfo)
	LOGI("use input meta");

	mConfig.mInputLayerNames.clear();

	for (auto &meta : inputMetaInfo) {
		std::shared_ptr<MetaInfo> metaInfo = meta.second;

		mConfig.mTensorInfo.ch = metaInfo->getChannel();
		mConfig.mTensorInfo.dim = metaInfo->dims[0];
		mConfig.mTensorInfo.width = metaInfo->getWidth();
		mConfig.mTensorInfo.height = metaInfo->getHeight();

				std::static_pointer_cast<DecodingNormal>(metaInfo->decodingTypeMap[DecodingType::NORMAL]);
		if (normalization && normalization->use) {
			mConfig.mMeanValue = normalization->mean[0];
			mConfig.mStdValue = normalization->std[0];

		mConfig.mDataType = metaInfo->dataType;
		mConfig.mInputLayerNames.push_back(meta.first);

	} catch (const std::exception &e) {
		LOGE("Fail to configure input meta info.");
		return MEDIA_VISION_ERROR_INVALID_OPERATION;

	int ret = setInputInfo();
int Inference::setInputInfo()
	mInputSize = cv::Size(mConfig.mTensorInfo.width, mConfig.mTensorInfo.height);

	inference_engine_layer_property property;
	// If an inference plugin doesn't support querying layer properties,
	// the tensor info given by the user is used.
	// If the plugin does support it, the given info is ignored.
	for (auto &name : mConfig.mInputLayerNames) {
		inference_engine_tensor_info tensor_info;
		tensor_info.data_type = ConvertToIE(mConfig.mDataType);

		// In the case of OpenCV, only NCHW is supported.
		tensor_info.shape_type = INFERENCE_TENSOR_SHAPE_NCHW;
		// TODO: modify to handle multiple tensor infos.
		tensor_info.shape.push_back(mConfig.mTensorInfo.dim);
		tensor_info.shape.push_back(mConfig.mTensorInfo.ch);
		tensor_info.shape.push_back(mConfig.mTensorInfo.height);
		tensor_info.shape.push_back(mConfig.mTensorInfo.width);
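		// The tensor size is the element count, i.e. the product of all shape dimensions.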
		tensor_info.size = 1;
		for (auto &dim : tensor_info.shape) {
			tensor_info.size *= dim;

		property.layers.insert(std::make_pair(name, tensor_info));

	LOGI("InputSize is %d x %d\n", mInputSize.width, mInputSize.height);
	LOGI("mean %.4f, deviation %.4f", mConfig.mMeanValue, mConfig.mStdValue);
	LOGI("outputNumber %d", mConfig.mMaxOutputNumbers);

	int ret = mBackend->SetInputLayerProperty(property);
	if (ret != INFERENCE_ENGINE_ERROR_NONE)
		LOGE("Fail to set input layer property");
int Inference::ConfigureOutputInfo(const std::vector<std::string> names,
								   std::vector<inference_engine_tensor_info> &tensors_info)
	inference_engine_layer_property property;

	mConfig.mOutputLayerNames = names;

	if (tensors_info.empty()) {
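		// No tensor info was given, so register a minimal one-element FLOAT32
		// placeholder per output layer. The backend is expected to replace it
		// with the real shape once the model is loaded.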
		inference_engine_tensor_info tensor_info = { std::vector<size_t> { 1 }, INFERENCE_TENSOR_SHAPE_NCHW,
													 INFERENCE_TENSOR_DATA_TYPE_FLOAT32, 1 };

		for (auto &name : mConfig.mOutputLayerNames) {
			LOGI("Configure %s layer as output", name.c_str());
			property.layers.insert(std::make_pair(name, tensor_info));

		if (mConfig.mOutputLayerNames.size() != tensors_info.size()) {
			LOGE("Output layer count is different from tensor info count.");
			return MEDIA_VISION_ERROR_INVALID_PARAMETER;

		for (size_t idx = 0; idx < mConfig.mOutputLayerNames.size(); ++idx) {
			LOGI("Configure %s layer as output", mConfig.mOutputLayerNames[idx].c_str());
			property.layers.insert(std::make_pair(mConfig.mOutputLayerNames[idx], tensors_info[idx]));

	int ret = setOutputInfo(property);
int Inference::configureOutputMetaInfo()
	OutputMetadata &outputMeta = mMetadata.GetOutputMeta();

	mConfig.mOutputLayerNames.clear();

	if (!outputMeta._tensor_info.empty()) {
		for (auto &info : outputMeta._tensor_info)
			mConfig.mOutputLayerNames.push_back(info.first);

	inference_engine_tensor_info tensor_info = { std::vector<size_t> { 1 }, INFERENCE_TENSOR_SHAPE_NCHW,
												 INFERENCE_TENSOR_DATA_TYPE_FLOAT32, 1 };
	inference_engine_layer_property property;

	for (auto &name : mConfig.mOutputLayerNames) {
		LOGI("Configure %s layer as output", name.c_str());
		property.layers.insert(std::make_pair(name, tensor_info));

	int ret = setOutputInfo(property);
int Inference::configureOutputMetaInfo(MetaMap &outputMetaInfo)
	mConfig.mOutputLayerNames.clear();

	for (auto &meta : outputMetaInfo) {
		std::shared_ptr<MetaInfo> &metaInfo = meta.second;

		mConfig.mDataType = metaInfo->dataType;
		mConfig.mOutputLayerNames.push_back(meta.first);

	} catch (const std::exception &e) {
		LOGE("Fail to configure output meta info.");
		return MEDIA_VISION_ERROR_INVALID_OPERATION;

	inference_engine_tensor_info tensor_info = { std::vector<size_t> { 1 }, INFERENCE_TENSOR_SHAPE_NCHW,
												 INFERENCE_TENSOR_DATA_TYPE_FLOAT32, 1 };
	inference_engine_layer_property property;

	for (auto &name : mConfig.mOutputLayerNames) {
		LOGI("Configure %s layer as output", name.c_str());
		property.layers.insert(std::make_pair(name, tensor_info));

	int ret = setOutputInfo(property);
int Inference::setOutputInfo(inference_engine_layer_property &property)
	int ret = mBackend->SetOutputLayerProperty(property);
	if (ret != INFERENCE_ENGINE_ERROR_NONE)
		LOGE("Fail to set output layer property");
int Inference::CheckBackendType(const mv_inference_backend_type_e backendType)
	// Check if a given backend type is valid or not.
	if (backendType <= MV_INFERENCE_BACKEND_NONE || backendType >= MV_INFERENCE_BACKEND_MAX) {
		LOGE("Invalid backend type.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	std::pair<std::string, bool> backend = mSupportedInferenceBackend[backendType];
	if (backend.second == false) {
		LOGE("%s type is not supported", (backend.first).c_str());
		return MEDIA_VISION_ERROR_NOT_SUPPORTED;

	LOGI("backend engine : %d", backendType);

	return MEDIA_VISION_ERROR_NONE;
int Inference::ConfigureTargetTypes(int targetType, bool isNewVersion)
	if (MV_INFERENCE_TARGET_DEVICE_NONE >= targetType || MV_INFERENCE_TARGET_DEVICE_MAX <= targetType) {
		LOGE("Invalid target device.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	if (MV_INFERENCE_TARGET_NONE >= targetType || MV_INFERENCE_TARGET_MAX <= targetType) {
		LOGE("Invalid target device.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	LOGI("Before converting target types : %d", targetType);

	// Convert old type to new one.
	switch (targetType) {
	case MV_INFERENCE_TARGET_CPU:
		targetType = MV_INFERENCE_TARGET_DEVICE_CPU;
	case MV_INFERENCE_TARGET_GPU:
		targetType = MV_INFERENCE_TARGET_DEVICE_GPU;
	case MV_INFERENCE_TARGET_CUSTOM:
		targetType = MV_INFERENCE_TARGET_DEVICE_CUSTOM;

	LOGI("After converting target types : %d", targetType);

	mConfig.mTargetTypes = targetType;

	return MEDIA_VISION_ERROR_NONE;
int Inference::ConfigureTargetDevices(const int targetDevices)
	// Check if given target types are valid or not.
	if (MV_INFERENCE_TARGET_DEVICE_NONE >= targetDevices || MV_INFERENCE_TARGET_DEVICE_MAX <= targetDevices) {
		LOGE("Invalid target device.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	LOGI("target devices : %d", targetDevices);

	if (!(mBackendCapacity.supported_accel_devices & targetDevices)) {
		LOGE("Backend doesn't support a given device acceleration.");
		return MEDIA_VISION_ERROR_NOT_SUPPORTED;

	mConfig.mTargetTypes = targetDevices;

	return MEDIA_VISION_ERROR_NONE;
bool Inference::IsTargetDeviceSupported(const int targetDevices)
	if (!(mBackendCapacity.supported_accel_devices & targetDevices)) {
		LOGE("Backend doesn't support the given device acceleration (%x).", targetDevices);
void Inference::ConfigureOutput(const int maxOutputNumbers)
	mConfig.mMaxOutputNumbers =
			std::max(std::min(maxOutputNumbers, MV_INFERENCE_OUTPUT_NUMBERS_MAX), MV_INFERENCE_OUTPUT_NUMBERS_MIN);

void Inference::ConfigureThreshold(const double threshold)
	mConfig.mConfidenceThresHold =
			std::max(std::min(threshold, MV_INFERENCE_CONFIDENCE_THRESHOLD_MAX), MV_INFERENCE_CONFIDENCE_THRESHOLD_MIN);
int Inference::ParseMetadata(const std::string filePath)
	int ret = mMetadata.Init(filePath);
	if (ret != MEDIA_VISION_ERROR_NONE) {
		LOGE("Fail to init metadata[%d]", ret);

	ret = mMetadata.Parse();
	if (ret != MEDIA_VISION_ERROR_NONE) {
		LOGE("Fail to parse metadata[%d]", ret);

	return MEDIA_VISION_ERROR_NONE;
void Inference::CleanupTensorBuffers(void)
	if (!mInputTensorBuffers.empty()) {
		mInputTensorBuffers.release();

	if (!mOutputTensorBuffers.empty()) {
		mOutputTensorBuffers.release();
int Inference::PrepareTenosrBuffers(void)
	// Release any input and output tensor buffers allocated earlier.
	// They will be allocated again for the new model file to be loaded.
	CleanupTensorBuffers();

	// If a model file is loaded again, the model type could be different, so
	// clean up the input and output layer properties so that they can be
	// updated again after the model file is reloaded.
	if (!mInputLayerProperty.layers.empty()) {
		mInputLayerProperty.layers.clear();
		std::map<std::string, inference_engine_tensor_info>().swap(mInputLayerProperty.layers);

	if (!mOutputLayerProperty.layers.empty()) {
		mOutputLayerProperty.layers.clear();
		std::map<std::string, inference_engine_tensor_info>().swap(mOutputLayerProperty.layers);
	// Get input tensor buffers from the backend engine if it allocated them.
	auto &inputTensorBuffers = mInputTensorBuffers.getIETensorBuffer();
	int ret = mBackend->GetInputTensorBuffers(inputTensorBuffers);
	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
		LOGE("Fail to get input tensor buffers from backend engine.");
		return ConvertEngineErrorToVisionError(ret);

	ret = mBackend->GetInputLayerProperty(mInputLayerProperty);
	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
		LOGE("Fail to get input layer property from backend engine.");
		return ConvertEngineErrorToVisionError(ret);

	// If the backend engine isn't able to allocate input tensor buffers
	// internally, then allocate them here.
	if (mInputTensorBuffers.empty()) {
		for (auto &layer : mInputLayerProperty.layers) {
			inference_engine_tensor_buffer tensor_buffer;

			ret = mInputTensorBuffers.allocate(tensor_buffer, layer.second);
			if (ret != INFERENCE_ENGINE_ERROR_NONE) {
				LOGE("Fail to allocate tensor buffer.");
				mInputTensorBuffers.release();

			mInputTensorBuffers.addTensorBuffer(layer.first, tensor_buffer);

	LOGI("Input tensor buffer count is %zu", mInputTensorBuffers.size());
	// Get output tensor buffers from the backend engine if it allocated them.
	auto &outputTensorBuffers = mOutputTensorBuffers.getIETensorBuffer();
	ret = mBackend->GetOutputTensorBuffers(outputTensorBuffers);
	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
		LOGE("Fail to get output tensor buffers from backend engine.");
		return ConvertEngineErrorToVisionError(ret);

	ret = mBackend->GetOutputLayerProperty(mOutputLayerProperty);
	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
		LOGE("Fail to get output layer property from backend engine.");
		return ConvertEngineErrorToVisionError(ret);

	// If the backend engine isn't able to allocate output tensor buffers
	// internally, then allocate them here.
	if (mOutputTensorBuffers.empty()) {
		for (auto &layer : mOutputLayerProperty.layers) {
			inference_engine_tensor_buffer tensor_buffer;

			ret = mOutputTensorBuffers.allocate(tensor_buffer, layer.second);
			if (ret != INFERENCE_ENGINE_ERROR_NONE) {
				LOGE("Fail to allocate tensor buffer.");
				mOutputTensorBuffers.release();

			mOutputTensorBuffers.addTensorBuffer(layer.first, tensor_buffer);

	LOGI("Output tensor buffer count is %zu", mOutputTensorBuffers.size());

	return MEDIA_VISION_ERROR_NONE;
int Inference::ConvertOutputDataTypeToFloat()
	IETensorBuffer &ieTensorBuffers = mOutputTensorBuffers.getIETensorBuffer();

	for (auto &ieTensorBuffer : ieTensorBuffers) {
		auto &tensorBuffer = ieTensorBuffer.second;

		// Normalize the output tensor data by converting it to float type in
		// the case of a quantized model.
		if (tensorBuffer.data_type == INFERENCE_TENSOR_DATA_TYPE_UINT8) {
			int ret = mOutputTensorBuffers.convertToFloat<unsigned char>(&tensorBuffer);
			if (ret != MEDIA_VISION_ERROR_NONE) {
				LOGE("Fail to convert tensor data to float type.");

		if (tensorBuffer.data_type == INFERENCE_TENSOR_DATA_TYPE_UINT16) {
			int ret = mOutputTensorBuffers.convertToFloat<unsigned short>(&tensorBuffer);
			if (ret != MEDIA_VISION_ERROR_NONE) {
				LOGE("Fail to convert tensor data to float type.");

	return MEDIA_VISION_ERROR_NONE;
int Inference::Bind(int backend_type, int device_type)
	int ret = CheckBackendType(static_cast<mv_inference_backend_type_e>(backend_type));
	if (ret != MEDIA_VISION_ERROR_NONE)

	std::string backendName = mSupportedInferenceBackend[backend_type].first;
	LOGI("backend string name: %s", backendName.c_str());

	inference_engine_config config = {
		.backend_name = backendName,
		.backend_type = backend_type,
		// The target device is CPU by default. If the user defined a desired
		// device type in the JSON file, it will be set by the Load callback.
		.target_devices = device_type,

	// Create a backend class object.
	mBackend = new InferenceEngineCommon();

#if ENABLE_INFERENCE_PROFILER
	mBackend->EnableProfiler(true);
	mBackend->DumpProfileToFile("profile_data_" + backendName + ".txt");
	} catch (const std::bad_alloc &ex) {
		LOGE("Fail to create backend : %s", ex.what());
		return MEDIA_VISION_ERROR_OUT_OF_MEMORY;

	ret = MEDIA_VISION_ERROR_NONE;
	// Load the configuration file if the given backend type is MLAPI.
	if (config.backend_type == MV_INFERENCE_BACKEND_MLAPI) {
		ret = mBackend->LoadConfigFile();
		if (ret != INFERENCE_ENGINE_ERROR_NONE) {
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

	// Bind a backend library.
	ret = mBackend->BindBackend(&config);
	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
		LOGE("Fail to bind backend library.(%d)", ret);
		return MEDIA_VISION_ERROR_INVALID_OPERATION;

	// Get capacity information from the backend.
	ret = mBackend->GetBackendCapacity(&mBackendCapacity);
	if (ret != MEDIA_VISION_ERROR_NONE) {
		mBackend->UnbindBackend();
		LOGE("Fail to get backend capacity.");

	if (!IsTargetDeviceSupported(mConfig.mTargetTypes)) {
		mBackend->UnbindBackend();
		LOGE("Tried to configure invalid target types.");
		return MEDIA_VISION_ERROR_NOT_SUPPORTED;

	return MEDIA_VISION_ERROR_NONE;
int Inference::Load(void)
	std::string label_file = mConfig.mUserFilePath;
	size_t userFileLength = label_file.length();
	if (userFileLength > 0 && access(label_file.c_str(), F_OK)) {
		LOGE("Label file [%s] is not accessible.", label_file.c_str());
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	int ret = (userFileLength > 0) ? SetUserFile(label_file) : MEDIA_VISION_ERROR_NONE;
	if (ret != MEDIA_VISION_ERROR_NONE) {
		LOGE("Fail to load label file.");

	// Check if the model file is valid or not.
	std::string ext_str = mConfig.mWeightFilePath.substr(mConfig.mWeightFilePath.find_last_of(".") + 1);
	std::map<std::string, int>::iterator key = mModelFormats.find(ext_str);
	if (key == mModelFormats.end()) {
		LOGE("Invalid model file format.(ext = %s)", ext_str.c_str());
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	LOGI("%s model file has been detected.", ext_str.c_str());

	std::vector<std::string> models;

	inference_model_format_e model_format = static_cast<inference_model_format_e>(key->second);

	// Push model file information to the models vector according to the detected model format.
	switch (model_format) {
	case INFERENCE_MODEL_CAFFE:
	case INFERENCE_MODEL_TF:
	case INFERENCE_MODEL_DARKNET:
	case INFERENCE_MODEL_DLDT:
	case INFERENCE_MODEL_ONNX:
	case INFERENCE_MODEL_VIVANTE:
		models.push_back(mConfig.mWeightFilePath);
		models.push_back(mConfig.mConfigFilePath);
	case INFERENCE_MODEL_TFLITE:
	case INFERENCE_MODEL_TORCH:
	case INFERENCE_MODEL_NNTRAINER:
	case INFERENCE_MODEL_SNPE:
		models.push_back(mConfig.mWeightFilePath);

	// Request model loading to the backend engine.
	ret = mBackend->Load(models, model_format);
	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
		LOGE("Fail to load model");
		std::vector<std::string>().swap(models);
		return ConvertEngineErrorToVisionError(ret);

	std::vector<std::string>().swap(models);

	// Prepare input and output tensor buffers.
	ret = PrepareTenosrBuffers();
	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
		LOGE("Fail to prepare buffer");

	return ConvertEngineErrorToVisionError(ret);
int Inference::Preprocess(std::vector<mv_source_h> &mv_sources, std::vector<cv::Mat> &cv_sources)
	unsigned int src_idx = 0;

	for (auto &buffer : mInputTensorBuffers.getIETensorBuffer()) {
		inference_engine_tensor_buffer &tensor_buffer = buffer.second;
		int data_type = ConvertToCv(tensor_buffer.data_type);
		mv_colorspace_e colorspace = MEDIA_VISION_COLORSPACE_INVALID;

		int ret = mv_source_get_colorspace(mv_sources[src_idx], &colorspace);
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to get color space.");

		if (mMetadata.GetInputMeta().IsParsed()) {
			layerInfo = mMetadata.GetInputMeta().GetLayer().at(buffer.first);

			if (!mMetadata.GetInputMeta().GetOption().empty())
				opt = mMetadata.GetInputMeta().GetOption().at(buffer.first);

			// NOTE: in the legacy path there is no way to set model-specific
			// dequantization parameters - zero point and scale.
			// TODO: find a proper way to support them.
			opt.normalization.use = true;
			opt.normalization.mean.assign(3, mConfig.mMeanValue);
			opt.normalization.std.assign(3, mConfig.mStdValue);

			layerInfo.name = buffer.first;
			layerInfo.dims.push_back(mConfig.mTensorInfo.dim);
			layerInfo.dims.push_back(mConfig.mTensorInfo.height);
			layerInfo.dims.push_back(mConfig.mTensorInfo.width);
			layerInfo.dims.push_back(mConfig.mTensorInfo.ch);

			// NOTE: in the legacy path there is no way to use a model-specific
			// color space, only a fixed one.
			// TODO: find a proper way to support it.
			layerInfo.colorSpace = MEDIA_VISION_COLORSPACE_RGB888;
			layerInfo.dataType = mConfig.mDataType;
			// TODO: find a proper way to set the shape type. The legacy path
			// cannot change it, but a different shape type may be needed
			// depending on the given inference engine.
			layerInfo.shapeType = INFERENCE_TENSOR_SHAPE_NHWC;

		// TODO: try-catch{} error handling
		ret = mPreProc.Run(cv_sources[src_idx++], colorspace, data_type, layerInfo, opt, tensor_buffer.buffer);
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to run pre-process.");

	return MEDIA_VISION_ERROR_NONE;
int Inference::Run(std::vector<mv_source_h> &mvSources, std::vector<mv_rectangle_s> &rects)
	int ret = INFERENCE_ENGINE_ERROR_NONE;

		LOGE("Not ready to run inference");
		return MEDIA_VISION_ERROR_INVALID_OPERATION;

	if (mvSources.empty()) {
		LOGE("mvSources is empty.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	// Only one input source per inference request is supported for now.
	if (mvSources.size() > 1) {
		LOGE("Only one mv source is allowed for the inference.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	if (!rects.empty() && rects.size() != mvSources.size()) {
		LOGE("mvSources.size() should be same as rects.size() if rects.empty() is false.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	if (mConfig.mTensorInfo.ch != 1 && mConfig.mTensorInfo.ch != 3) {
		LOGE("Channel not supported.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;
	std::vector<cv::Mat> cvSources;

	ret = ConvertToCvSource(mvSources, cvSources, rects);
	if (ret != MEDIA_VISION_ERROR_NONE) {
		LOGE("Fail to convert mv source to cv source.");

	// mSourceSize is the original input image's size.
	// TODO: consider multiple cv sources.
	mSourceSize = cvSources[0].size();

	ret = Preprocess(mvSources, cvSources);
	if (ret != MEDIA_VISION_ERROR_NONE) {
		LOGE("Fail to preprocess given input sources.");

	ret = mBackend->Run(mInputTensorBuffers.getIETensorBuffer(), mOutputTensorBuffers.getIETensorBuffer());
	if (ret != INFERENCE_ENGINE_ERROR_NONE)

	return ConvertOutputDataTypeToFloat();
int Inference::Run(std::vector<void *> &buffer_objs)
	int ret = INFERENCE_ENGINE_ERROR_NONE;

		LOGE("Not ready to run inference");
		return MEDIA_VISION_ERROR_INVALID_OPERATION;

	if (buffer_objs.empty()) {
		LOGE("buffer_objs is empty.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	// Only one input source per inference request is supported for now.
	if (buffer_objs.size() > 1) {
		LOGE("Only one source is allowed for the inference.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	if (mInputTensorBuffers.getIETensorBuffer().size() != buffer_objs.size()) {
		LOGE("Raw source count is invalid.");
		return MEDIA_VISION_ERROR_INVALID_PARAMETER;

	unsigned int buffer_idx = 0;

	for (auto &buffer : mInputTensorBuffers.getIETensorBuffer()) {
		inference_engine_tensor_buffer &tensor_buffer = buffer.second;
		inference_engine_tensor_buffer *buffer_obj =
				static_cast<inference_engine_tensor_buffer *>(buffer_objs[buffer_idx]);

		if (tensor_buffer.size != buffer_obj->size) {
			LOGE("Raw buffer size is invalid.");
			return MEDIA_VISION_ERROR_INVALID_PARAMETER;

		LOGI("Number of tensor bytes: %zu", buffer_obj->size);

		memcpy(tensor_buffer.buffer, buffer_obj->buffer, tensor_buffer.size);

	ret = mBackend->Run(mInputTensorBuffers.getIETensorBuffer(), mOutputTensorBuffers.getIETensorBuffer());
	if (ret != INFERENCE_ENGINE_ERROR_NONE)

	return ConvertOutputDataTypeToFloat();
int Inference::Run()
	int ret = INFERENCE_ENGINE_ERROR_NONE;

		LOGE("Not ready to run inference");
		return MEDIA_VISION_ERROR_INVALID_OPERATION;

	ret = mBackend->Run(mInputTensorBuffers.getIETensorBuffer(), mOutputTensorBuffers.getIETensorBuffer());
	if (ret != INFERENCE_ENGINE_ERROR_NONE)

	return ConvertOutputDataTypeToFloat();
std::pair<std::string, bool> Inference::GetSupportedInferenceBackend(int backend)
	return mSupportedInferenceBackend[backend];
int Inference::GetClassficationResults(ImageClassificationResults *results)
	// Will contain the top N results in ascending order.
	std::vector<std::pair<float, int> > topScore;
	auto threshold = mConfig.mConfidenceThresHold;
	constexpr unsigned int default_top_number = 5;
	tensor_t outputTensorInfo;

	// Get the inference result and store it in outputTensorInfo.
	int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
	if (ret != MEDIA_VISION_ERROR_NONE) {
		LOGE("Fail to get output result.");

	PostProcess postProc;
	unsigned int classes = outputTensorInfo.dimInfo[0][1];
	unsigned int top_number = default_top_number;

	if (mMetadata.GetOutputMeta().IsParsed()) {
		OutputMetadata outputMetadata = mMetadata.GetOutputMeta();
		std::vector<int> indexes = outputMetadata.GetScoreDimInfo().GetValidIndexAll();

		if (indexes.size() != 1) {
			LOGE("Invalid dim size. It should be 1");
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		if (!mOutputTensorBuffers.exist(outputMetadata.GetScoreName())) {
			LOGE("output buffer is NULL");
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		top_number = outputMetadata.GetScoreTopNumber();
		threshold = outputMetadata.GetScoreThreshold();

		classes = mOutputLayerProperty.layers[outputMetadata.GetScoreName()].shape[indexes[0]];

	postProc.ScoreClear(top_number);
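	// ScoreClear()/ScorePush()/ScorePop() presumably maintain a bounded top-N
	// container of (score, index) pairs: reset it to hold top_number entries,
	// push candidates that pass the threshold, then pop the final ranking.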
	auto *prediction = reinterpret_cast<float *>(outputTensorInfo.data[0]);

	LOGI("class count: %u", classes);

	for (unsigned int idx = 0; idx < classes; ++idx) {
		float value = prediction[idx];

		if (mMetadata.GetOutputMeta().IsParsed()) {
			OutputMetadata outputMetadata = mMetadata.GetOutputMeta();
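			// Dequantize the raw score, presumably with the standard affine
			// mapping real = scale * (quantized - zero_point).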
			if (outputMetadata.GetScoreDeQuant()) {
				value = PostProcess::dequant(value, outputMetadata.GetScoreDeQuantScale(),
											 outputMetadata.GetScoreDeQuantZeroPoint());

			if (outputMetadata.GetScoreType() == INFERENCE_SCORE_TYPE_SIGMOID)
				value = PostProcess::sigmoid(value);

		if (value < threshold)

		postProc.ScorePush(value, idx);

	postProc.ScorePop(topScore);
	results->number_of_classes = 0;

	for (auto &score : topScore) {
		LOGI("score: %.3f, threshold: %.3f", score.first, threshold);
		LOGI("idx:%d", score.second);
		LOGI("classProb: %.3f", score.first);

		results->indices.push_back(score.second);
		results->confidences.push_back(score.first);
		results->names.push_back(mUserListName[score.second]);
		results->number_of_classes++;

	LOGI("Inference: GetClassificationResults: %d\n", results->number_of_classes);
	return MEDIA_VISION_ERROR_NONE;
int Inference::GetObjectDetectionResults(ObjectDetectionResults *results)
	if (mMetadata.GetOutputMeta().IsParsed()) {
		OutputMetadata &outputMeta = mMetadata.GetOutputMeta();

		if (!mOutputTensorBuffers.exist(outputMeta.GetBoxName()) ||
			!mOutputTensorBuffers.exist(outputMeta.GetScoreName())) {
			LOGE("output buffers named %s or %s are NULL", outputMeta.GetBoxName().c_str(),
				 outputMeta.GetScoreName().c_str());
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
		if (boxIndexes.size() != 1) {
			LOGE("Invalid dim size. It should be 1");
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		int boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
		int numberOfObjects = 0;

		if (outputMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
			std::vector<int> scoreIndexes = outputMeta.GetScoreDimInfo().GetValidIndexAll();
			if (scoreIndexes.size() != 1) {
				LOGE("Invalid dim size. It should be 1");
				return MEDIA_VISION_ERROR_INVALID_OPERATION;

			numberOfObjects = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]];

		ObjectDecoder objDecoder(mOutputTensorBuffers, outputMeta, boxOffset,
								 static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getWidth()),
								 static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getHeight()),

		objDecoder.decode();
		results->number_of_objects = 0;

		auto &rLoc = results->locations;

		for (auto &box : objDecoder.getObjectAll()) {
			results->indices.push_back(box.index);
			results->names.push_back(mUserListName[box.index]);
			results->confidences.push_back(box.score);
			auto &bLoc = box.location;

			auto srcW = static_cast<double>(mSourceSize.width);
			auto srcH = static_cast<double>(mSourceSize.height);
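			// The decoded box is center-based and normalized; halfW/halfH below
			// hold the top-left corner obtained by shifting the center by half
			// the box size (despite what their names suggest).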
			auto halfW = (bLoc.x - bLoc.width * 0.5f);
			auto halfH = (bLoc.y - bLoc.height * 0.5f);
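			// Letterbox preprocessing scales the source by
			// scale = min(1, dstW / srcW, dstH / srcH) and pads the remainder
			// equally on both sides, so a normalized model-space coordinate u
			// maps back to the source image as (u * dst - pad) / scale.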
			if (mMetadata.GetInputMeta().option.begin()->second.resizer == Resizer::LETTERBOX) {
				double dstW = static_cast<double>(mMetadata.GetInputMeta().layer.begin()->second.getWidth());
				double dstH = static_cast<double>(mMetadata.GetInputMeta().layer.begin()->second.getHeight());
				double scale = std::min(1.0, std::min(dstW / srcW, dstH / srcH));
				double padSize[] = { (dstW - (scale * srcW)) / 2.0, (dstH - (scale * srcH)) / 2.0 };

						cv::Rect(static_cast<int>(std::min(srcW, std::max((halfW * dstW - padSize[0]) / scale, 0.0))),
								 static_cast<int>(std::min(srcH, std::max((halfH * dstH - padSize[1]) / scale, 0.0))),
								 static_cast<int>((bLoc.width * dstW) / scale + padSize[0]),
								 static_cast<int>((bLoc.height * dstH) / scale + padSize[1]));

				rect.width = (rect.x + rect.width) > srcW ? srcW - rect.x : rect.width;
				rect.height = (rect.y + rect.height) > srcH ? srcH - rect.y : rect.height;

				rLoc.push_back(rect);
				rLoc.push_back(cv::Rect(halfW * srcW, halfH * srcH, bLoc.width * srcW, bLoc.height * srcH));

			results->number_of_objects++;

		LOGI("Inference: GetObjectDetectionResults: %d\n", results->number_of_objects);
		tensor_t outputTensorInfo;

		// Get the inference result and store it in outputTensorInfo.
		int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to get output result.");

		// In the case of object detection, some models apply post-processing
		// internally and others don't, so the two cases are handled separately.
		float *boxes = nullptr;
		float *classes = nullptr;
		float *scores = nullptr;
		int number_of_detections = 0;

		if (outputTensorInfo.dimInfo.size() == 1) {
			// There is no way to know how many objects were detected unless the
			// number is provided explicitly, so each backend has to provide it
			// manually. For example, OpenCV's MobilenetV1-SSD doesn't provide
			// it: the output shape is 1x1xNx7, where the 1st of the 7 values
			// holds the image id. That id is useless without batch-mode support,
			// so it is reused here to carry the number of detections.
			number_of_detections = static_cast<int>(*reinterpret_cast<float *>(outputTensorInfo.data[0]));
			cv::Mat cvOutputData(number_of_detections, outputTensorInfo.dimInfo[0][3], CV_32F,
								 outputTensorInfo.data[0]);

			cv::Mat cvLeft = cvOutputData.col(3).clone();
			cv::Mat cvTop = cvOutputData.col(4).clone();
			cv::Mat cvRight = cvOutputData.col(5).clone();
			cv::Mat cvBottom = cvOutputData.col(6).clone();
			cv::Mat cvScores, cvClasses, cvBoxes;
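			// Reorder the columns to [top, left, bottom, right] so the decode
			// loop below can index boxes[idx * 4 + n] uniformly for both branches.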
			cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
			cv::hconcat(cvBoxElems, 4, cvBoxes);

			cvClasses = cvOutputData.col(1).clone();

			cvScores = cvOutputData.col(2).clone();

			boxes = cvBoxes.ptr<float>(0);
			classes = cvClasses.ptr<float>(0);
			scores = cvScores.ptr<float>(0);
			boxes = reinterpret_cast<float *>(outputTensorInfo.data[0]);
			classes = reinterpret_cast<float *>(outputTensorInfo.data[1]);
			scores = reinterpret_cast<float *>(outputTensorInfo.data[2]);
			number_of_detections = static_cast<int>(*reinterpret_cast<float *>(outputTensorInfo.data[3]));

		LOGI("number_of_detections = %d", number_of_detections);

		results->number_of_objects = 0;

		for (int idx = 0; idx < number_of_detections; ++idx) {
			if (scores[idx] < mConfig.mConfidenceThresHold)

			int left = static_cast<int>(boxes[idx * 4 + 1] * mSourceSize.width);
			int top = static_cast<int>(boxes[idx * 4 + 0] * mSourceSize.height);
			int right = static_cast<int>(boxes[idx * 4 + 3] * mSourceSize.width);
			int bottom = static_cast<int>(boxes[idx * 4 + 2] * mSourceSize.height);

			loc.width = right - left + 1;
			loc.height = bottom - top + 1;

			results->indices.push_back(static_cast<int>(classes[idx]));
			results->confidences.push_back(scores[idx]);
			results->names.push_back(mUserListName[static_cast<int>(classes[idx])]);
			results->locations.push_back(loc);
			results->number_of_objects++;

			LOGI("objectClass: %d", static_cast<int>(classes[idx]));
			LOGI("confidence:%f", scores[idx]);
			LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);

		LOGI("Inference: GetObjectDetectionResults: %d\n", results->number_of_objects);

	return MEDIA_VISION_ERROR_NONE;
int Inference::GetFaceDetectionResults(FaceDetectionResults *results)
	if (mMetadata.GetOutputMeta().IsParsed()) {
		OutputMetadata &outputMeta = mMetadata.GetOutputMeta();

		if (!mOutputTensorBuffers.exist(outputMeta.GetBoxName()) ||
			!mOutputTensorBuffers.exist(outputMeta.GetScoreName())) {
			LOGE("output buffers named %s or %s are NULL", outputMeta.GetBoxName().c_str(),
				 outputMeta.GetScoreName().c_str());
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
		if (boxIndexes.size() != 1) {
			LOGE("Invalid dim size. It should be 1");
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		int boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
		int numberOfFaces = 0;

		if (outputMeta.GetBoxDecodingType() != INFERENCE_BOX_DECODING_TYPE_BYPASS) {
			std::vector<int> scoreIndexes = outputMeta.GetScoreDimInfo().GetValidIndexAll();
			if (scoreIndexes.size() != 1) {
				LOGE("Invalid dim size. It should be 1");
				return MEDIA_VISION_ERROR_INVALID_OPERATION;

			numberOfFaces = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]];
		ObjectDecoder objDecoder(mOutputTensorBuffers, outputMeta, boxOffset,
								 static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getWidth()),
								 static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getHeight()),

		objDecoder.decode();
		results->number_of_faces = 0;

		for (auto &face : objDecoder.getObjectAll()) {
			results->confidences.push_back(face.score);
			results->locations.push_back(
					cv::Rect(static_cast<int>((face.location.x - face.location.width * 0.5f) *
											  static_cast<float>(mSourceSize.width)),
							 static_cast<int>((face.location.y - face.location.height * 0.5f) *
											  static_cast<float>(mSourceSize.height)),
							 static_cast<int>(face.location.width * static_cast<float>(mSourceSize.width)),
							 static_cast<int>(face.location.height * static_cast<float>(mSourceSize.height))));
			results->number_of_faces++;

		LOGI("Inference: GetFaceDetectionResults: %d\n", results->number_of_faces);
		tensor_t outputTensorInfo;

		// Get the inference result and store it in outputTensorInfo.
		int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to get output result.");

		// In the case of face detection, some models apply post-processing
		// internally and others don't, so the two cases are handled separately.
		float *boxes = nullptr;
		float *classes = nullptr;
		float *scores = nullptr;
		int number_of_detections = 0;
		cv::Mat cvScores, cvClasses, cvBoxes;

		if (outputTensorInfo.dimInfo.size() == 1) {
			// There is no way to know how many objects were detected unless the
			// number is provided explicitly, so each backend has to provide it
			// manually. For example, OpenCV's MobilenetV1-SSD doesn't provide
			// it: the output shape is 1x1xNx7, where the 1st of the 7 values
			// holds the image id. That id is useless without batch-mode support,
			// so it is reused here to carry the number of detections.
			number_of_detections = static_cast<int>(*reinterpret_cast<float *>(outputTensorInfo.data[0]));
			cv::Mat cvOutputData(number_of_detections, outputTensorInfo.dimInfo[0][3], CV_32F,
								 outputTensorInfo.data[0]);

			cv::Mat cvLeft = cvOutputData.col(3).clone();
			cv::Mat cvTop = cvOutputData.col(4).clone();
			cv::Mat cvRight = cvOutputData.col(5).clone();
			cv::Mat cvBottom = cvOutputData.col(6).clone();
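			// As in the object detection path, reorder the columns to
			// [top, left, bottom, right] for the decode loop below.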
			cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
			cv::hconcat(cvBoxElems, 4, cvBoxes);

			cvClasses = cvOutputData.col(1).clone();

			cvScores = cvOutputData.col(2).clone();

			boxes = cvBoxes.ptr<float>(0);
			classes = cvClasses.ptr<float>(0);
			scores = cvScores.ptr<float>(0);
			boxes = reinterpret_cast<float *>(outputTensorInfo.data[0]);
			classes = reinterpret_cast<float *>(outputTensorInfo.data[1]);
			scores = reinterpret_cast<float *>(outputTensorInfo.data[2]);
			number_of_detections = static_cast<int>(*reinterpret_cast<float *>(outputTensorInfo.data[3]));

		results->number_of_faces = 0;

		for (int idx = 0; idx < number_of_detections; ++idx) {
			if (scores[idx] < mConfig.mConfidenceThresHold)

			int left = static_cast<int>(boxes[idx * 4 + 1] * mSourceSize.width);
			int top = static_cast<int>(boxes[idx * 4 + 0] * mSourceSize.height);
			int right = static_cast<int>(boxes[idx * 4 + 3] * mSourceSize.width);
			int bottom = static_cast<int>(boxes[idx * 4 + 2] * mSourceSize.height);

			loc.width = right - left + 1;
			loc.height = bottom - top + 1;
			results->confidences.push_back(scores[idx]);
			results->locations.push_back(loc);
			results->number_of_faces++;

			LOGI("confidence:%f", scores[idx]);
			LOGI("class: %f", classes[idx]);
			LOGI("left:%f, top:%f, right:%f, bottom:%f", boxes[idx * 4 + 1], boxes[idx * 4 + 0], boxes[idx * 4 + 3],
				 boxes[idx * 4 + 2]);
			LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);

		LOGI("Inference: GetFaceDetectionResults: %d\n", results->number_of_faces);

	return MEDIA_VISION_ERROR_NONE;
int Inference::GetFacialLandMarkDetectionResults(FacialLandMarkDetectionResults *results)
	if (mMetadata.GetOutputMeta().IsParsed()) {
		OutputMetadata &outputMeta = mMetadata.GetOutputMeta();
		if (!mOutputTensorBuffers.exist(outputMeta.GetLandmarkName()) ||
			!mOutputTensorBuffers.exist(outputMeta.GetScoreName())) {
			LOGE("output buffers named %s or %s are NULL", outputMeta.GetLandmarkName().c_str(),
				 outputMeta.GetScoreName().c_str());
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		int heatMapWidth = 0;
		int heatMapHeight = 0;
		int heatMapChannel = 0;
		std::vector<int> channelIndexes = outputMeta.GetLandmarkDimInfo().GetValidIndexAll();
		int number_of_landmarks = heatMapChannel;

		if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
			LOGI("landmark dim size: %zu and idx[0] is %d", channelIndexes.size(), channelIndexes[0]);
			number_of_landmarks = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]] /
								  outputMeta.GetLandmarkOffset();
		} else if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) {
			number_of_landmarks = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]];

			heatMapWidth = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
								   .shape[outputMeta.GetLandmarkHeatMapInfo().wIdx];
			heatMapHeight = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
									.shape[outputMeta.GetLandmarkHeatMapInfo().hIdx];
			heatMapChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
									 .shape[outputMeta.GetLandmarkHeatMapInfo().cIdx];

		LOGI("heatMap: w[%d], h[%d], c[%d]", heatMapWidth, heatMapHeight, heatMapChannel);

		PoseDecoder poseDecoder(mOutputTensorBuffers, outputMeta, heatMapWidth, heatMapHeight, heatMapChannel,
								number_of_landmarks);
		// Initialize the decoder queue with landmarks to be decoded.
		int ret = poseDecoder.init();
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to init poseDecoder");

		if (outputMeta.GetLandmarkCoordinate() == INFERENCE_LANDMARK_COORDINATE_TYPE_PIXEL) {
			inputW = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getWidth());
			inputH = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getHeight());

		float thresRadius = outputMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ?
								  outputMeta.GetLandmarkHeatMapInfo().nmsRadius;

		poseDecoder.decode(inputW, inputH, thresRadius);

		for (int landmarkIndex = 0; landmarkIndex < number_of_landmarks; landmarkIndex++) {
			results->locations.push_back(
					cv::Point(poseDecoder.getPointX(0, landmarkIndex) * static_cast<float>(mSourceSize.width),
							  poseDecoder.getPointY(0, landmarkIndex) * static_cast<float>(mSourceSize.height)));

		results->number_of_landmarks = results->locations.size();
		tensor_t outputTensorInfo;

		// Get the inference result and store it in outputTensorInfo.
		int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to get output result.");
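		// In the bypass path, each landmark is an (x, y) pair, so the landmark
		// count is half of the flattened output dimension.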
		int number_of_detections = outputTensorInfo.dimInfo[0][1] >> 1;

		results->number_of_landmarks = number_of_detections;
		results->locations.resize(number_of_detections);

		LOGI("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height);

		float *loc = reinterpret_cast<float *>(outputTensorInfo.data[0]);

		for (auto &point : results->locations) {
			point.x = static_cast<int>(*loc++ * mSourceSize.width);
			point.y = static_cast<int>(*loc++ * mSourceSize.height);

			LOGI("x:%d, y:%d", point.x, point.y);

	LOGI("Inference: FacialLandmarkDetectionResults: %d\n", results->number_of_landmarks);
	return MEDIA_VISION_ERROR_NONE;
int Inference::GetPoseLandmarkDetectionResults(std::unique_ptr<mv_inference_pose_s> &detectionResults, int width,
	auto poseResult = std::make_unique<mv_inference_pose_s>();

	if (mMetadata.GetOutputMeta().IsParsed()) {
		OutputMetadata &outputMeta = mMetadata.GetOutputMeta();

		if (!mOutputTensorBuffers.exist(outputMeta.GetLandmarkName()) ||
			!mOutputTensorBuffers.exist(outputMeta.GetScoreName())) {
			LOGE("output buffers named %s or %s are NULL", outputMeta.GetLandmarkName().c_str(),
				 outputMeta.GetScoreName().c_str());
			return MEDIA_VISION_ERROR_INVALID_OPERATION;

		int heatMapWidth = 0;
		int heatMapHeight = 0;
		int heatMapChannel = 0;

		if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP ||
			outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) {
			heatMapWidth = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
								   .shape[outputMeta.GetLandmarkHeatMapInfo().wIdx];
			heatMapHeight = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
									.shape[outputMeta.GetLandmarkHeatMapInfo().hIdx];
			heatMapChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()]
									 .shape[outputMeta.GetLandmarkHeatMapInfo().cIdx];

		LOGI("heatMap: w[%d], h[%d], c[%d]", heatMapWidth, heatMapHeight, heatMapChannel);

		std::vector<int> channelIndexes = outputMeta.GetLandmarkDimInfo().GetValidIndexAll();

		// If INFERENCE_LANDMARK_DECODING_TYPE_BYPASS, the landmark channel is
		// derived from the shape of the landmark output tensor. Otherwise,
		// heatMapChannel is used by default.
		int landmarkChannel = heatMapChannel;

		if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS)
			landmarkChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]] /
							  outputMeta.GetLandmarkOffset();
		else if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL)
			landmarkChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]];

		poseResult->number_of_landmarks_per_pose = mUserListName.empty() ? landmarkChannel :
																		   static_cast<int>(mUserListName.size());

		LOGI("number of landmarks per pose: %d", poseResult->number_of_landmarks_per_pose);

		if (poseResult->number_of_landmarks_per_pose >= MAX_NUMBER_OF_LANDMARKS_PER_POSE) {
			LOGE("Exceeded maximum number of landmarks per pose(%d >= %d).", poseResult->number_of_landmarks_per_pose,
				 MAX_NUMBER_OF_LANDMARKS_PER_POSE);
			return MEDIA_VISION_ERROR_INVALID_PARAMETER;
		PoseDecoder poseDecoder(mOutputTensorBuffers, outputMeta, heatMapWidth, heatMapHeight, heatMapChannel,
								poseResult->number_of_landmarks_per_pose);

		// Initialize the decoder queue with landmarks to be decoded.
		int ret = poseDecoder.init();
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to init poseDecoder");

		float thresRadius = outputMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ?
								  outputMeta.GetLandmarkHeatMapInfo().nmsRadius;
		if (outputMeta.GetLandmarkCoordinate() == INFERENCE_LANDMARK_COORDINATE_TYPE_PIXEL) {
			inputW = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getWidth());
			inputH = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.getHeight());

		poseDecoder.decode(inputW, inputH, thresRadius);
		poseResult->number_of_poses = poseDecoder.getNumberOfPose();

		for (int poseIndex = 0; poseIndex < poseResult->number_of_poses; ++poseIndex) {
			for (int landmarkIndex = 0; landmarkIndex < poseResult->number_of_landmarks_per_pose; ++landmarkIndex) {
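				// A non-empty user file remaps the landmark order; each entry is
				// expected to hold a 1-based joint index into the model output.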
				int part = landmarkIndex;
				if (!mUserListName.empty()) {
					part = std::stoi(mUserListName[landmarkIndex]) - 1;

				poseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
				poseResult->landmarks[poseIndex][landmarkIndex].point.x =
						poseDecoder.getPointX(poseIndex, part) * static_cast<float>(mSourceSize.width);
				poseResult->landmarks[poseIndex][landmarkIndex].point.y =
						poseDecoder.getPointY(poseIndex, part) * static_cast<float>(mSourceSize.height);
				poseResult->landmarks[poseIndex][landmarkIndex].label = landmarkIndex;
				poseResult->landmarks[poseIndex][landmarkIndex].score = poseDecoder.getScore(poseIndex, part);

		detectionResults = std::move(poseResult);
		tensor_t outputTensorInfo;

		// Get the inference result and store it in outputTensorInfo.
		int ret = mOutputTensorBuffers.GetTensorInfo(mOutputLayerProperty, outputTensorInfo);
		if (ret != MEDIA_VISION_ERROR_NONE) {
			LOGE("Fail to get output result.");

		cv::Mat reShapeTest(cv::Size(outputTensorInfo.dimInfo[0][2], outputTensorInfo.dimInfo[0][1]),
							CV_32FC(outputTensorInfo.dimInfo[0][3]), outputTensorInfo.data[0]);
		// Use std::vector instead of a variable-length array, which isn't standard C++.
		std::vector<cv::Mat> multiChannels(outputTensorInfo.dimInfo[0][3]);

		cv::split(reShapeTest, multiChannels);
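		// After reshaping to H x W with C channels, each channel holds one
		// landmark's heat map; the per-channel peak gives that landmark's position.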
		float ratioX = static_cast<float>(outputTensorInfo.dimInfo[0][2]);
		float ratioY = static_cast<float>(outputTensorInfo.dimInfo[0][1]);

		poseResult->number_of_poses = 1;
		poseResult->number_of_landmarks_per_pose = outputTensorInfo.dimInfo[0][3];

		if (poseResult->number_of_landmarks_per_pose >= MAX_NUMBER_OF_LANDMARKS_PER_POSE) {
			LOGE("Exceeded maximum number of landmarks per pose(%d >= %d).", poseResult->number_of_landmarks_per_pose,
				 MAX_NUMBER_OF_LANDMARKS_PER_POSE);
			return MEDIA_VISION_ERROR_INVALID_PARAMETER;
		for (int poseIndex = 0; poseIndex < poseResult->number_of_poses; ++poseIndex) {
			for (int landmarkIndex = 0; landmarkIndex < poseResult->number_of_landmarks_per_pose; landmarkIndex++) {
				int part = landmarkIndex;
				if (!mUserListName.empty()) {
					part = std::stoi(mUserListName[landmarkIndex]) - 1;

				cv::Mat heatMap = multiChannels[part];

				cv::Mat blurredHeatMap;

				cv::GaussianBlur(heatMap, blurredHeatMap, cv::Size(), 5.0, 5.0);
				// Search the blurred map so that the blur actually suppresses
				// noisy single-pixel maxima before peak extraction.
				cv::minMaxLoc(blurredHeatMap, NULL, &score, NULL, &loc);

				loc2f.x = (static_cast<float>(loc.x) / ratioX);
				loc2f.y = (static_cast<float>(loc.y) / ratioY);

				LOGI("landmarkIndex[%2d] - mapping to [%2d]: x[%.3f], y[%.3f], score[%.3f]", landmarkIndex, part,
					 loc2f.x, loc2f.y, score);

				poseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
				poseResult->landmarks[poseIndex][landmarkIndex].point.x =
						static_cast<int>(static_cast<float>(width) * loc2f.x);
				poseResult->landmarks[poseIndex][landmarkIndex].point.y =
						static_cast<int>(static_cast<float>(height) * loc2f.y);
				poseResult->landmarks[poseIndex][landmarkIndex].score = score;
				poseResult->landmarks[poseIndex][landmarkIndex].label = -1;

		detectionResults = std::move(poseResult);

	return MEDIA_VISION_ERROR_NONE;