62262adc63c19585ac696a5df06f51480e9d273b
[platform/core/api/mediavision.git] / mv_inference / inference / src / Inference.cpp
1 /**
2  * Copyright (c) 2019 Samsung Electronics Co., Ltd All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "mv_private.h"
18 #include "Inference.h"
19 #include "InferenceIni.h"
20
21 #include <map>
22
23 #include <unistd.h>
24 #include <fstream>
25 #include <string>
26 #include <queue>
27 #include <algorithm>
28
29 #define MV_INFERENCE_OUTPUT_NUMBERS_MAX 10
30 #define MV_INFERENCE_OUTPUT_NUMBERS_MIN 1
31 #define MV_INFERENCE_CONFIDENCE_THRESHOLD_MAX 1.0
32 #define MV_INFERENCE_CONFIDENCE_THRESHOLD_MIN 0.0
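// Clamp ranges used by ConfigureOutput() and ConfigureThreshold() below.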
33
34 typedef enum {
35         InputAttrNoType = 0,
36         InputAttrFloat32 = 1,
37         InputAttrInt32 = 2,
38         InputAttrUInt8 = 3,
39         InputAttrInt64 = 4,
40         InputAttrString = 5,
41         InputAttrBool = 6,
42 } InputAttrType;
43
44 namespace mediavision
45 {
46 namespace inference
47 {
48         InferenceConfig::InferenceConfig() :
49                         mConfigFilePath(),
50                         mWeightFilePath(),
51                         mUserFilePath(),
52                         mDataType(MV_INFERENCE_DATA_FLOAT32),
53                         mBackedType(MV_INFERENCE_BACKEND_NONE),
54                         mTargetTypes(MV_INFERENCE_TARGET_NONE),
55                         mConfidenceThresHold(),
56                         mMeanValue(),
57                         mStdValue(),
58                         mMaxOutputNumbers(1)
59         {
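                // Initialize tensor dimensions to -1, i.e. not configured yet.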
60                 mTensorInfo.width = -1;
61                 mTensorInfo.height = -1;
62                 mTensorInfo.dim = -1;
63                 mTensorInfo.ch = -1;
64         }
65
66         Inference::Inference() :
67                         mCanRun(),
68                         mConfig(),
69                         mBackendCapacity(),
70                         mSupportedInferenceBackend(),
71                         mInputSize(cv::Size()),
72                         mCh(),
73                         mDim(),
74                         mDeviation(),
75                         mMean(),
76                         mThreshold(),
77                         mOutputNumbers(),
78                         mSourceSize(cv::Size()),
79                         mInputBuffer(cv::Mat()),
80                         engine_config(),
81                         mBackend()
82         {
83                 LOGI("ENTER");
84
85                 mSupportedInferenceBackend.insert(std::make_pair(
86                                 MV_INFERENCE_BACKEND_OPENCV, std::make_pair("opencv", false)));
87                 mSupportedInferenceBackend.insert(std::make_pair(
88                                 MV_INFERENCE_BACKEND_TFLITE, std::make_pair("tflite", false)));
89                 mSupportedInferenceBackend.insert(std::make_pair(
90                                 MV_INFERENCE_BACKEND_ARMNN, std::make_pair("armnn", false)));
91                 mSupportedInferenceBackend.insert(std::make_pair(
92                                 MV_INFERENCE_BACKEND_MLAPI, std::make_pair("mlapi", false)));
93                 mSupportedInferenceBackend.insert(std::make_pair(
94                                 MV_INFERENCE_BACKEND_NNFW, std::make_pair("mlapi", false)));
95
96                 CheckSupportedInferenceBackend();
97
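                // Log, for each known backend, whether the ini configuration enabled it.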
98                 for (int i = 0; i < MV_INFERENCE_BACKEND_MAX; ++i) {
99                         auto iter = mSupportedInferenceBackend.find(i);
100                         LOGE("%d: %s: %s", i, (iter->second).first.c_str(),
101                                  (iter->second).second ? "TRUE" : "FALSE");
102                 }
103
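                // Map model file extensions to model formats. Load() selects the format
                // from the weight file extension.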
104                 mModelFormats.insert(std::make_pair<std::string, int>(
105                                 "caffemodel", INFERENCE_MODEL_CAFFE));
106                 mModelFormats.insert(
107                                 std::make_pair<std::string, int>("pb", INFERENCE_MODEL_TF));
108                 mModelFormats.insert(std::make_pair<std::string, int>(
109                                 "tflite", INFERENCE_MODEL_TFLITE));
110                 mModelFormats.insert(
111                                 std::make_pair<std::string, int>("t7", INFERENCE_MODEL_TORCH));
112                 mModelFormats.insert(std::make_pair<std::string, int>(
113                                 "weights", INFERENCE_MODEL_DARKNET));
114                 mModelFormats.insert(
115                                 std::make_pair<std::string, int>("bin", INFERENCE_MODEL_DLDT));
116                 mModelFormats.insert(
117                                 std::make_pair<std::string, int>("onnx", INFERENCE_MODEL_ONNX));
118                 mModelFormats.insert(std::make_pair<std::string, int>(
119                                 "nb", INFERENCE_MODEL_VIVANTE));
120
121                 LOGI("LEAVE");
122         }
123
124         Inference::~Inference()
125         {
126                 CleanupTensorBuffers();
127
128                 if (!mInputLayerProperty.tensor_infos.empty()) {
129                         mInputLayerProperty.tensor_infos.clear();
130                         std::vector<inference_engine_tensor_info>().swap(
131                                         mInputLayerProperty.tensor_infos);
132                 }
133                 if (!mOutputLayerProperty.tensor_infos.empty()) {
134                         mOutputLayerProperty.tensor_infos.clear();
135                         std::vector<inference_engine_tensor_info>().swap(
136                                         mOutputLayerProperty.tensor_infos);
137                 }
138
139                 mModelFormats.clear();
140
141                 // Release backend engine.
142                 if (mBackend) {
143                         mBackend->UnbindBackend();
144                         delete mBackend;
145                 }
146
147                 LOGI("Released backend engine.");
148         }
149
150         void Inference::CheckSupportedInferenceBackend()
151         {
152                 LOGE("ENTER");
153
154                 InferenceInI ini;
155                 ini.LoadInI();
156
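                // Mark every backend listed in the ini file as supported.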
157                 std::vector<int> supportedBackend = ini.GetSupportedInferenceEngines();
158                 for (std::vector<int>::const_iterator it = supportedBackend.begin();
159                          it != supportedBackend.end(); ++it) {
160                         LOGE("engine: %d", *it);
161
162                         auto iter = mSupportedInferenceBackend.find(*it);
163                         (iter->second).second = true;
164                 }
165
166                 LOGE("LEAVE");
167         }
168
169         int Inference::ConvertEngineErrorToVisionError(int error)
170         {
171                 int ret = MEDIA_VISION_ERROR_NONE;
172
173                 switch (error) {
174                 case INFERENCE_ENGINE_ERROR_NONE:
175                         ret = MEDIA_VISION_ERROR_NONE;
176                         break;
177                 case INFERENCE_ENGINE_ERROR_NOT_SUPPORTED:
178                         ret = MEDIA_VISION_ERROR_NOT_SUPPORTED;
179                         break;
180                 case INFERENCE_ENGINE_ERROR_MSG_TOO_LONG:
181                         ret = MEDIA_VISION_ERROR_MSG_TOO_LONG;
182                         break;
183                 case INFERENCE_ENGINE_ERROR_NO_DATA:
184                         ret = MEDIA_VISION_ERROR_NO_DATA;
185                         break;
186                 case INFERENCE_ENGINE_ERROR_KEY_NOT_AVAILABLE:
187                         ret = MEDIA_VISION_ERROR_KEY_NOT_AVAILABLE;
188                         break;
189                 case INFERENCE_ENGINE_ERROR_OUT_OF_MEMORY:
190                         ret = MEDIA_VISION_ERROR_OUT_OF_MEMORY;
191                         break;
192                 case INFERENCE_ENGINE_ERROR_INVALID_PARAMETER:
193                         ret = MEDIA_VISION_ERROR_INVALID_PARAMETER;
194                         break;
195                 case INFERENCE_ENGINE_ERROR_INVALID_OPERATION:
196                         ret = MEDIA_VISION_ERROR_INVALID_OPERATION;
197                         break;
198                 case INFERENCE_ENGINE_ERROR_PERMISSION_DENIED:
199                         ret = MEDIA_VISION_ERROR_PERMISSION_DENIED;
200                         break;
201                 case INFERENCE_ENGINE_ERROR_NOT_SUPPORTED_FORMAT:
202                         ret = MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
203                         break;
204                 case INFERENCE_ENGINE_ERROR_INTERNAL:
205                         ret = MEDIA_VISION_ERROR_INTERNAL;
206                         break;
207                 case INFERENCE_ENGINE_ERROR_INVALID_DATA:
208                         ret = MEDIA_VISION_ERROR_INVALID_DATA;
209                         break;
210                 case INFERENCE_ENGINE_ERROR_INVALID_PATH:
211                         ret = MEDIA_VISION_ERROR_INVALID_PATH;
212                         break;
213                 default:
214                         LOGE("Unknown inference engine error type");
215                 }
216
217                 return ret;
218         }
219
220         int Inference::ConvertTargetTypes(int given_types)
221         {
222                 int target_types = INFERENCE_TARGET_NONE;
223
224                 if (given_types & MV_INFERENCE_TARGET_DEVICE_CPU)
225                         target_types |= INFERENCE_TARGET_CPU;
226                 if (given_types & MV_INFERENCE_TARGET_DEVICE_GPU)
227                         target_types |= INFERENCE_TARGET_GPU;
228                 if (given_types & MV_INFERENCE_TARGET_DEVICE_CUSTOM)
229                         target_types |= INFERENCE_TARGET_CUSTOM;
230
231                 return target_types;
232         }
233
234         int Inference::ConvertToCv(int given_type)
235         {
236                 int type = 0;
237
238                 switch (given_type) {
239                 case INFERENCE_TENSOR_DATA_TYPE_UINT8:
240                         LOGI("Type is %d ch with UINT8", mCh);
241                         type = mCh == 1 ? CV_8UC1 : CV_8UC3;
242                         break;
243                 case INFERENCE_TENSOR_DATA_TYPE_FLOAT32:
244                         LOGI("Type is %d ch with FLOAT32", mCh);
245                         type = mCh == 1 ? CV_32FC1 : CV_32FC3;
246                         break;
247                 default:
248                         LOGI("unknown data type so FLOAT32 data type will be used by default");
249                         type = mCh == 1 ? CV_32FC1 : CV_32FC3;
250                         break;
251                 }
252
253                 return type;
254         }
255
256         inference_tensor_data_type_e Inference::ConvertToIE(int given_type)
257         {
258                 inference_tensor_data_type_e type = INFERENCE_TENSOR_DATA_TYPE_FLOAT32;
259
260                 switch (given_type) {
261                 case MV_INFERENCE_DATA_FLOAT32:
262                         type = INFERENCE_TENSOR_DATA_TYPE_FLOAT32;
263                         break;
264                 case MV_INFERENCE_DATA_UINT8:
265                         type = INFERENCE_TENSOR_DATA_TYPE_UINT8;
266                         break;
267                 default:
268                         LOGI("unknown data type so FLOAT32 data type will be used by default");
269                         break;
270                 }
271
272                 return type;
273         }
274
275         int Inference::Preprocess(cv::Mat cvImg, cv::Mat cvDst, int data_type)
276         {
277                 mSourceSize = cvImg.size();
278                 int width = mInputSize.width;
279                 int height = mInputSize.height;
280
281                 cv::Mat sample;
282                 if (cvImg.channels() == 3 && mCh == 1)
283                         cv::cvtColor(cvImg, sample, cv::COLOR_BGR2GRAY);
284                 else
285                         sample = cvImg;
286
287                 // size
288                 cv::Mat sampleResized;
289                 if (sample.size() != cv::Size(width, height))
290                         cv::resize(sample, sampleResized, cv::Size(width, height));
291                 else
292                         sampleResized = sample;
293
294                 // type
295                 cv::Mat sampleFloat;
296                 if (mCh == 3)
297                         sampleResized.convertTo(sampleFloat, CV_32FC3);
298                 else
299                         sampleResized.convertTo(sampleFloat, CV_32FC1);
300
301                 // normalize
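                // dst = (src - mean) / deviation, applied per channel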
302                 cv::Mat sampleNormalized;
303                 cv::Mat meanMat;
304                 if (mCh == 3)
305                         meanMat = cv::Mat(sampleFloat.size(), CV_32FC3,
306                                                           cv::Scalar((float) mMean, (float) mMean,
307                                                           (float) mMean));
308                 else
309                         meanMat = cv::Mat(sampleFloat.size(), CV_32FC1,
310                                                           cv::Scalar((float) mMean));
311
312                 cv::subtract(sampleFloat, meanMat, sampleNormalized);
313
314                 sampleNormalized /= static_cast<float>(mDeviation);
315
316                 sampleNormalized.convertTo(cvDst, data_type);
317
318                 return MEDIA_VISION_ERROR_NONE;
319         }
320
321         int Inference::SetUserFile(std::string filename)
322         {
323                 std::ifstream fp(filename.c_str());
324                 if (!fp.is_open()) {
325                         return MEDIA_VISION_ERROR_INVALID_PATH;
326                 }
327
328                 std::string userListName;
329                 while (std::getline(fp, userListName)) {
330                         // One label per line; skip empty lines.
331                         if (!userListName.empty())
332                                 mUserListName.push_back(userListName);
333                 }
334
335                 fp.close();
336
337                 return MEDIA_VISION_ERROR_NONE;
338         }
339
340         void Inference::ConfigureModelFiles(const std::string modelConfigFilePath,
341                                                                                 const std::string modelWeightFilePath,
342                                                                                 const std::string modelUserFilePath)
343         {
344                 LOGI("ENTER");
345
346                 mConfig.mConfigFilePath = modelConfigFilePath;
347                 mConfig.mWeightFilePath = modelWeightFilePath;
348                 mConfig.mUserFilePath = modelUserFilePath;
349
350                 LOGI("LEAVE");
351         }
352
353         void Inference::ConfigureTensorInfo(int width, int height, int dim, int ch,
354                                                                                 double stdValue, double meanValue)
355         {
356                 LOGI("ENTER");
357
358                 mConfig.mTensorInfo = { width, height, dim, ch };
359                 mConfig.mStdValue = stdValue;
360                 mConfig.mMeanValue = meanValue;
361
362                 LOGI("LEAVE");
363         }
364
365         void Inference::ConfigureInputInfo(int width, int height, int dim, int ch,
366                                                                            double stdValue, double meanValue,
367                                                                            int dataType,
368                                                                            const std::vector<std::string> names)
369         {
370                 LOGI("ENTER");
371
372                 mConfig.mTensorInfo = { width, height, dim, ch };
373                 mConfig.mStdValue = stdValue;
374                 mConfig.mMeanValue = meanValue;
375                 mConfig.mDataType = static_cast<mv_inference_data_type_e>(dataType);
376                 mConfig.mInputLayerNames = names;
377
378                 inference_engine_layer_property property;
379                 // If an inference plugin doesn't support querying layer properties,
380                 // the tensor info given by the user will be used.
381                 // If the plugin does support it, the given info will be ignored.
382                 inference_engine_tensor_info tensor_info;
383
384                 tensor_info.data_type = ConvertToIE(dataType);
385
386                 // The OpenCV backend only supports NCHW.
387                 tensor_info.shape_type = INFERENCE_TENSOR_SHAPE_NCHW;
388                 // modify to handle multiple tensor infos
389                 tensor_info.shape.push_back(mConfig.mTensorInfo.dim);
390                 tensor_info.shape.push_back(mConfig.mTensorInfo.ch);
391                 tensor_info.shape.push_back(mConfig.mTensorInfo.height);
392                 tensor_info.shape.push_back(mConfig.mTensorInfo.width);
393
394                 tensor_info.size = 1;
395                 for (std::vector<size_t>::iterator iter = tensor_info.shape.begin();
396                          iter != tensor_info.shape.end(); ++iter) {
397                         tensor_info.size *= (*iter);
398                 }
399
400                 property.layer_names = mConfig.mInputLayerNames;
401                 property.tensor_infos.push_back(tensor_info);
402
403                 int ret = mBackend->SetInputLayerProperty(property);
404                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
405                         LOGE("Fail to set input layer property");
406                 }
407
408                 LOGI("LEAVE");
409         }
410
411         void Inference::ConfigureOutputInfo(const std::vector<std::string> names)
412         {
413                 LOGI("ENTER");
414
415                 mConfig.mOutputLayerNames = names;
416
417                 inference_engine_layer_property property;
418
419                 property.layer_names = names;
420                 int ret = mBackend->SetOutputLayerProperty(property);
421                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
422                         LOGE("Fail to set output layer property");
423                 }
424
425                 LOGI("LEAVE");
426         }
427
428         int Inference::ConfigureBackendType(
429                         const mv_inference_backend_type_e backendType)
430         {
431                 std::pair<std::string, bool> backend =
432                                 mSupportedInferenceBackend[backendType];
433                 if (backend.second == false) {
434                         LOGE("%s type is not supported", (backend.first).c_str());
435                         return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
436                 }
437
438                 LOGI("backend engine : %d", backendType);
439
440                 mConfig.mBackedType = backendType;
441
442                 return MEDIA_VISION_ERROR_NONE;
443         }
444
445         int Inference::ConfigureTargetTypes(const int targetType)
446         {
447                 // Check if given target types are valid or not.
448                 if (MV_INFERENCE_TARGET_NONE >= targetType ||
449                         MV_INFERENCE_TARGET_MAX <= targetType) {
450                         LOGE("Invalid target device.");
451                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
452                 }
453
454                 LOGI("Before converting target types : %d", targetType);
455
456                 unsigned int new_type = MV_INFERENCE_TARGET_DEVICE_NONE;
457
458                 // Convert old type to new one.
459                 switch (targetType) {
460                 case MV_INFERENCE_TARGET_CPU:
461                         new_type = MV_INFERENCE_TARGET_DEVICE_CPU;
462                         break;
463                 case MV_INFERENCE_TARGET_GPU:
464                         new_type = MV_INFERENCE_TARGET_DEVICE_GPU;
465                         break;
466                 case MV_INFERENCE_TARGET_CUSTOM:
467                         new_type = MV_INFERENCE_TARGET_DEVICE_CUSTOM;
468                         break;
469                 }
470
471                 LOGI("After converting target types : %d", new_type);
472
473                 mConfig.mTargetTypes = new_type;
474
475                 return MEDIA_VISION_ERROR_NONE;
476         }
477
478         int Inference::ConfigureTargetDevices(const int targetDevices)
479         {
480                 // Check if given target types are valid or not.
481                 if (MV_INFERENCE_TARGET_DEVICE_NONE >= targetDevices ||
482                         MV_INFERENCE_TARGET_DEVICE_MAX <= targetDevices) {
483                         LOGE("Invalid target device.");
484                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
485                 }
486
487                 LOGI("target devices : %d", targetDevices);
488
489                 mConfig.mTargetTypes = targetDevices;
490
491                 return MEDIA_VISION_ERROR_NONE;
492         }
493
494         void Inference::ConfigureOutput(const int maxOutputNumbers)
495         {
496                 mConfig.mMaxOutputNumbers = std::max(
497                                 std::min(maxOutputNumbers, MV_INFERENCE_OUTPUT_NUMBERS_MAX),
498                                 MV_INFERENCE_OUTPUT_NUMBERS_MIN);
499         }
500
501         void Inference::ConfigureThreshold(const double threshold)
502         {
503                 mConfig.mConfidenceThresHold = std::max(
504                                 std::min(threshold, MV_INFERENCE_CONFIDENCE_THRESHOLD_MAX),
505                                 MV_INFERENCE_CONFIDENCE_THRESHOLD_MIN);
506         }
507
508         void Inference::CleanupTensorBuffers(void)
509         {
510                 LOGI("ENTER");
511
512                 if (!mInputTensorBuffers.empty()) {
513                         std::vector<inference_engine_tensor_buffer>::iterator iter;
514                         for (iter = mInputTensorBuffers.begin();
515                                  iter != mInputTensorBuffers.end(); iter++) {
516                                 inference_engine_tensor_buffer tensor_buffer = *iter;
517
518                                 // If the tensor buffer is owned by the backend, skip releasing it
519                                 // here; the backend will release it.
520                                 if (tensor_buffer.owner_is_backend) {
521                                         continue;
522                                 }
523
524                                 if (tensor_buffer.data_type ==
525                                         INFERENCE_TENSOR_DATA_TYPE_FLOAT32)
526                                         delete[] static_cast<float *>(tensor_buffer.buffer);
527                                 else
528                                         delete[] static_cast<unsigned char *>(tensor_buffer.buffer);
529                         }
530
531                         LOGI("input tensor buffers(%zu) have been released.",
532                                  mInputTensorBuffers.size());
533                         std::vector<inference_engine_tensor_buffer>().swap(
534                                         mInputTensorBuffers);
535                 }
536
537                 if (!mOutputTensorBuffers.empty()) {
538                         std::vector<inference_engine_tensor_buffer>::iterator iter;
539                         for (iter = mOutputTensorBuffers.begin();
540                                  iter != mOutputTensorBuffers.end(); iter++) {
541                                 inference_engine_tensor_buffer tensor_buffer = *iter;
542
543                                 // If the tensor buffer is owned by the backend, skip releasing it
544                                 // here; the backend will release it.
545                                 if (tensor_buffer.owner_is_backend) {
546                                         continue;
547                                 }
548
549                                 if (tensor_buffer.data_type ==
550                                         INFERENCE_TENSOR_DATA_TYPE_FLOAT32)
551                                         delete[] static_cast<float *>(tensor_buffer.buffer);
552                                 else
553                                         delete[] static_cast<unsigned char *>(tensor_buffer.buffer);
554                         }
555
556                         LOGI("output tensor buffers(%zu) have been released.",
557                                  mOutputTensorBuffers.size());
558                         std::vector<inference_engine_tensor_buffer>().swap(
559                                         mOutputTensorBuffers);
560                 }
561
562                 LOGI("LEAVE");
563         }
564
565         int Inference::PrepareTenosrBuffers(void)
566         {
567                 // If input and output tensor buffers were allocated before, release them.
568                 // They will be allocated again according to the new model file to be loaded.
569                 CleanupTensorBuffers();
570
571                 // If the model file is loaded again, the model type could be different, so
572                 // clean up the input and output layer properties so that they can be
573                 // updated again after reloading the model file.
574                 if (!mInputLayerProperty.tensor_infos.empty()) {
575                         mInputLayerProperty.tensor_infos.clear();
576                         std::vector<inference_engine_tensor_info>().swap(
577                                         mInputLayerProperty.tensor_infos);
578                 }
579                 if (!mOutputLayerProperty.tensor_infos.empty()) {
580                         mOutputLayerProperty.tensor_infos.clear();
581                         std::vector<inference_engine_tensor_info>().swap(
582                                         mOutputLayerProperty.tensor_infos);
583                 }
584
585                 // Get input tensor buffers from the backend engine if it allocated them.
586                 int ret = mBackend->GetInputTensorBuffers(mInputTensorBuffers);
587                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
588                         LOGE("Fail to get input tensor buffers from backend engine.");
589                         return ConvertEngineErrorToVisionError(ret);
590                 }
591
592                 ret = mBackend->GetInputLayerProperty(mInputLayerProperty);
593                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
594                         LOGE("Fail to get input layer property from backend engine.");
595                         return ConvertEngineErrorToVisionError(ret);
596                 }
597
598                 // If the backend engine isn't able to allocate input tensor buffers internally,
599                 // then allocate the buffers here.
600                 if (mInputTensorBuffers.empty()) {
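                        // tensor_info.size is an element count; tensor_buffer.size is in bytes.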
601                         for (int i = 0; i < mInputLayerProperty.tensor_infos.size(); ++i) {
602                                 inference_engine_tensor_info tensor_info =
603                                                 mInputLayerProperty.tensor_infos[i];
604                                 inference_engine_tensor_buffer tensor_buffer;
605                                 if (tensor_info.data_type ==
606                                         INFERENCE_TENSOR_DATA_TYPE_FLOAT32) {
607                                         tensor_buffer.buffer = new float[tensor_info.size];
608                                         tensor_buffer.size = tensor_info.size * 4;
609                                 } else if (tensor_info.data_type ==
610                                                    INFERENCE_TENSOR_DATA_TYPE_UINT8) {
611                                         tensor_buffer.buffer = new unsigned char[tensor_info.size];
612                                         tensor_buffer.size = tensor_info.size;
613                                 } else if (tensor_info.data_type ==
614                                                    INFERENCE_TENSOR_DATA_TYPE_FLOAT16) {
615                                         tensor_buffer.buffer = new short[tensor_info.size];
616                                         tensor_buffer.size = tensor_info.size * 2; // 2 bytes per FLOAT16 element
617                                 } else {
618                                         LOGE("Invalid input tensor data type.");
619                                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
620                                 }
621
622                                 if (tensor_buffer.buffer == NULL) {
623                                         LOGE("Fail to allocate input tensor buffer.");
624                                         return MEDIA_VISION_ERROR_OUT_OF_MEMORY;
625                                 }
626
627                                 LOGI("Allocated input tensor buffer(size = %zu, data type = %d)",
628                                          tensor_info.size, tensor_info.data_type);
629                                 tensor_buffer.owner_is_backend = 0;
630                                 tensor_buffer.data_type = tensor_info.data_type;
631                                 mInputTensorBuffers.push_back(tensor_buffer);
632                         }
633                 }
634
635                 LOGI("Input tensor buffer count is %zu", mInputTensorBuffers.size());
636
637                 // Get output tensor buffers from the backend engine if it allocated them.
638                 ret = mBackend->GetOutputTensorBuffers(mOutputTensorBuffers);
639                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
640                         LOGE("Fail to get output tensor buffers from backend engine.");
641                         return ConvertEngineErrorToVisionError(ret);
642                 }
643
644                 ret = mBackend->GetOutputLayerProperty(mOutputLayerProperty);
645                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
646                         LOGE("Fail to get output layer property from backend engine.");
647                         return ConvertEngineErrorToVisionError(ret);
648                 }
649
650                 // If the backend engine isn't able to allocate output tensor buffers internally,
651                 // then allocate the buffers here.
652                 if (mOutputTensorBuffers.empty()) {
653                         for (int i = 0; i < mOutputLayerProperty.tensor_infos.size(); ++i) {
654                                 inference_engine_tensor_info tensor_info =
655                                                 mOutputLayerProperty.tensor_infos[i];
656                                 inference_engine_tensor_buffer tensor_buffer;
657                                 if (tensor_info.data_type ==
658                                         INFERENCE_TENSOR_DATA_TYPE_FLOAT32) {
659                                         tensor_buffer.buffer = new float[tensor_info.size];
660                                         tensor_buffer.size = tensor_info.size * 4;
661                                 } else if (tensor_info.data_type ==
662                                                    INFERENCE_TENSOR_DATA_TYPE_UINT8) {
663                                         tensor_buffer.buffer = new unsigned char[tensor_info.size];
664                                         tensor_buffer.size = tensor_info.size;
665                                 } else if (tensor_info.data_type ==
666                                                    INFERENCE_TENSOR_DATA_TYPE_FLOAT16) {
667                                         tensor_buffer.buffer = new short[tensor_info.size];
668                                         tensor_buffer.size = tensor_info.size * 2; // 2 bytes per FLOAT16 element
669                                 } else {
670                                         LOGE("Invalid output tensor data type.");
671                                         CleanupTensorBuffers();
672                                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
673                                 }
674
675                                 if (tensor_buffer.buffer == NULL) {
676                                         LOGE("Fail to allocate output tensor buffer.");
677                                         CleanupTensorBuffers();
678                                         return MEDIA_VISION_ERROR_OUT_OF_MEMORY;
679                                 }
680
681                                 LOGI("Allocated output tensor buffer(size = %zu, data type = %d)",
682                                          tensor_info.size, tensor_info.data_type);
683
684                                 tensor_buffer.owner_is_backend = 0;
685                                 tensor_buffer.data_type = tensor_info.data_type;
686                                 mOutputTensorBuffers.push_back(tensor_buffer);
687                         }
688                 }
689
690                 LOGI("Output tensor buffer count is %zu", mOutputTensorBuffers.size());
691
692                 return MEDIA_VISION_ERROR_NONE;
693         }
694
695         int Inference::FillOutputResult(tensor_t &outputData)
696         {
697                 for (int i = 0; i < mOutputLayerProperty.tensor_infos.size(); ++i) {
698                         inference_engine_tensor_info tensor_info =
699                                         mOutputLayerProperty.tensor_infos[i];
700
701                         std::vector<int> tmpDimInfo;
702                         for (int j = 0; j < static_cast<int>(tensor_info.shape.size());
703                                  j++) {
704                                 tmpDimInfo.push_back(tensor_info.shape[j]);
705                         }
706
707                         outputData.dimInfo.push_back(tmpDimInfo);
708
709                         // Normalize output tensor data converting it to float type in case of quantized model.
710                         if (tensor_info.data_type == INFERENCE_TENSOR_DATA_TYPE_UINT8) {
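                                // Dequantize assuming a fixed scale of 1/255 and a zero point of 0.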
711                                 float *new_buf = new float[tensor_info.size];
712                                 if (new_buf == NULL) {
713                                         LOGE("Fail to allocate a new output tensor buffer.");
714                                         return MEDIA_VISION_ERROR_OUT_OF_MEMORY;
715                                 }
716
717                                 unsigned char *ori_buf = static_cast<unsigned char *>(
718                                                 mOutputTensorBuffers[i].buffer);
719
720                                 for (int j = 0; j < tensor_info.size; j++) {
721                                         new_buf[j] = static_cast<float>(ori_buf[j]) / 255.0f;
722                                 }
723
724                         // Replace the original buffer with the new one and release the original.
725                                 mOutputTensorBuffers[i].buffer = new_buf;
726
727                                 if (!mOutputTensorBuffers[i].owner_is_backend)
728                                         delete[] ori_buf;
729                         }
730
731                         if (tensor_info.data_type == INFERENCE_TENSOR_DATA_TYPE_FLOAT16) {
732                                 float *new_buf = new float[tensor_info.size];
733                                 if (new_buf == NULL) {
734                                         LOGE("Fail to allocate a new output tensor buffer.");
735                                         return MEDIA_VISION_ERROR_OUT_OF_MEMORY;
736                                 }
737
738                                 short *ori_buf =
739                                                 static_cast<short *>(mOutputTensorBuffers[i].buffer);
740
741                                 for (int j = 0; j < tensor_info.size; j++) {
742                                         new_buf[j] = static_cast<float>(ori_buf[j]);
743                                 }
744
745                         // Replace the original buffer with the new one and release the original.
746                                 mOutputTensorBuffers[i].buffer = new_buf;
747
748                                 if (!mOutputTensorBuffers[i].owner_is_backend)
749                                         delete[] ori_buf;
750                         }
751
752                         outputData.data.push_back(
753                                         static_cast<void *>(mOutputTensorBuffers[i].buffer));
754                 }
755
756                 return MEDIA_VISION_ERROR_NONE;
757         }
758
759         int Inference::Bind(void)
760         {
761                 LOGI("ENTER");
762
763                 if (mConfig.mBackedType <= MV_INFERENCE_BACKEND_NONE ||
764                         mConfig.mBackedType >= MV_INFERENCE_BACKEND_MAX) {
765                         LOGE("NOT SUPPORTED BACKEND %d", mConfig.mBackedType);
766                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
767                 }
768
769                 auto iter = mSupportedInferenceBackend.find(mConfig.mBackedType);
770                 std::string backendName = (iter->second).first;
771                 LOGI("backend string name: %s", backendName.c_str());
772
773                 inference_engine_config config = {
774                         .backend_name = backendName,
775                         .backend_type = mConfig.mBackedType,
776                         // By default, the target device is CPU. If the user defined a desired
777                         // device type in the json file, it will be set by the Load callback.
778                         .target_devices = mConfig.mTargetTypes,
779                 };
780
781                 // Create a backend class object.
782                 try {
783                         mBackend = new InferenceEngineCommon();
784                 } catch (const std::bad_alloc &ex) {
785                         LOGE("Fail to create backend : %s", ex.what());
786                         return MEDIA_VISION_ERROR_OUT_OF_MEMORY;
787                 }
788
789                 // Bind a backend library.
790                 int ret = mBackend->BindBackend(&config);
791                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
792                         LOGE("Fail to bind backend library.(%d)", mConfig.mBackedType);
793                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
794                 }
795
796                 // Get capacity information from a backend.
797                 ret = mBackend->GetBackendCapacity(&mBackendCapacity);
798                 if (ret != MEDIA_VISION_ERROR_NONE) {
799                         LOGE("Fail to get backend capacity.");
800                         return ret;
801                 }
802
803                 LOGI("LEAVE");
804
805                 return MEDIA_VISION_ERROR_NONE;
806         }
807
808         int Inference::Prepare(void)
809         {
810                 LOGI("ENTER");
811
812                 mCh = mConfig.mTensorInfo.ch;
813                 mDim = mConfig.mTensorInfo.dim;
814                 mInputSize =
815                                 cv::Size(mConfig.mTensorInfo.width, mConfig.mTensorInfo.height);
816                 LOGI("InputSize is %d x %d\n", mInputSize.width, mInputSize.height);
817
818                 mDeviation = mConfig.mStdValue;
819                 mMean = mConfig.mMeanValue;
820                 LOGI("mean %.4f, deviation %.4f", mMean, mDeviation);
821
822                 mOutputNumbers = mConfig.mMaxOutputNumbers;
823                 LOGI("outputNumber %d", mOutputNumbers);
824
825                 mThreshold = mConfig.mConfidenceThresHold;
826                 LOGI("threshold %.4f", mThreshold);
827
828                 // Check if backend supports a given target device/devices or not.
829                 if (mConfig.mTargetTypes & MV_INFERENCE_TARGET_DEVICE_CPU) {
830                         if (!(mBackendCapacity.supported_accel_devices &
831                                   INFERENCE_TARGET_CPU)) {
832                                 LOGE("Backend doesn't support CPU device as an accelerator.");
833                                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
834                         }
835                 }
836
837                 if (mConfig.mTargetTypes & MV_INFERENCE_TARGET_DEVICE_GPU) {
838                         if (!(mBackendCapacity.supported_accel_devices &
839                                   INFERENCE_TARGET_GPU)) {
840                                 LOGE("Backend doesn't support GPU device as an accelerator.");
841                                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
842                         }
843                 }
844
845                 if (mConfig.mTargetTypes & MV_INFERENCE_TARGET_DEVICE_CUSTOM) {
846                         if (!(mBackendCapacity.supported_accel_devices &
847                                   INFERENCE_TARGET_CUSTOM)) {
848                                 LOGE("Backend doesn't support custom device as an accelerator.");
849                                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
850                         }
851                 }
852
853                 mBackend->SetTargetDevices(ConvertTargetTypes(mConfig.mTargetTypes));
854
855                 LOGI("LEAVE");
856
857                 return MEDIA_VISION_ERROR_NONE;
858         }
859
860         int Inference::Load(void)
861         {
862                 LOGI("ENTER");
863
864                 std::string label_file = mConfig.mUserFilePath;
865                 size_t userFileLength = label_file.length();
866                 if (userFileLength > 0 && access(label_file.c_str(), F_OK)) {
867                         LOGE("Label file does not exist at [%s].", label_file.c_str());
868                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
869                 }
870
871                 int ret = (userFileLength > 0) ? SetUserFile(label_file) :
872                                                                                  MEDIA_VISION_ERROR_NONE;
873                 if (ret != MEDIA_VISION_ERROR_NONE) {
874                         LOGE("Fail to load label file.");
875                         return ret;
876                 }
877
878                 // Check if model file is valid or not.
879                 std::string ext_str = mConfig.mWeightFilePath.substr(
880                                 mConfig.mWeightFilePath.find_last_of(".") + 1);
881                 std::map<std::string, int>::iterator key = mModelFormats.find(ext_str);
882                 if (key == mModelFormats.end()) {
883                         LOGE("Invalid model file format.(ext = %s)", ext_str.c_str());
884                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
885                 }
886
887                 LOGI("%s model file has been detected.", ext_str.c_str());
888
889                 std::vector<std::string> models;
890
891                 inference_model_format_e model_format =
892                                 static_cast<inference_model_format_e>(key->second);
893
894                 // Push model file information to models vector properly according to detected model format.
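                // The weight file always comes first; formats with a separate topology or
                // config file append it as the second entry.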
895                 switch (model_format) {
896                 case INFERENCE_MODEL_CAFFE:
897                 case INFERENCE_MODEL_TF:
898                 case INFERENCE_MODEL_DARKNET:
899                 case INFERENCE_MODEL_DLDT:
900                 case INFERENCE_MODEL_ONNX:
901                 case INFERENCE_MODEL_VIVANTE:
902                         models.push_back(mConfig.mWeightFilePath);
903                         models.push_back(mConfig.mConfigFilePath);
904                         break;
905                 case INFERENCE_MODEL_TFLITE:
906                 case INFERENCE_MODEL_TORCH:
907                         models.push_back(mConfig.mWeightFilePath);
908                         break;
909                 default:
910                         break;
911                 }
912
913                 // Request model loading to backend engine.
914                 ret = mBackend->Load(models, model_format);
915                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
916                         delete mBackend; mBackend = NULL; // avoid a second delete in the destructor
917                         LOGE("Fail to load model");
918                         mCanRun = false;
919                         std::vector<std::string>().swap(models);
920                         return ConvertEngineErrorToVisionError(ret);
921                 }
922
923                 std::vector<std::string>().swap(models);
924
925                 // Prepare input and output tensor buffers.
926                 PrepareTenosrBuffers();
927
928                 mCanRun = true;
929
930                 LOGI("LEAVE");
931
932                 return ConvertEngineErrorToVisionError(ret);
933         }
934
935         int Inference::Run(std::vector<mv_source_h> &mvSources,
936                                            std::vector<mv_rectangle_s> &rects)
937         {
938                 int ret = INFERENCE_ENGINE_ERROR_NONE;
939
940                 if (!mCanRun) {
941                         LOGE("Invalid to run inference");
942                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
943                 }
944
945                 /* convert mv_source to cv::Mat */
946                 cv::Mat cvSource;
947                 cv::Rect cvRoi;
948                 unsigned int width = 0, height = 0;
949                 unsigned int bufferSize = 0;
950                 unsigned char *buffer = NULL;
951
952                 if (mvSources.empty()) {
953                         LOGE("mvSources is empty. One mv source is required.");
954                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
955                 }
956
957                 // Only one input source can be requested for inference as of now.
958                 if (mvSources.size() > 1) {
959                         LOGE("It allows only one mv source for the inference.");
960                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
961                 }
962
963                 // TODO. Consider multiple sources.
964                 mv_source_h mvSource = mvSources.front();
965                 mv_rectangle_s *roi = rects.empty() ? NULL : &(rects.front());
966
967                 mv_colorspace_e colorspace = MEDIA_VISION_COLORSPACE_INVALID;
968
969                 if (mv_source_get_width(mvSource, &width) != MEDIA_VISION_ERROR_NONE ||
970                         mv_source_get_height(mvSource, &height) !=
971                                         MEDIA_VISION_ERROR_NONE ||
972                         mv_source_get_colorspace(mvSource, &colorspace) !=
973                                         MEDIA_VISION_ERROR_NONE ||
974                         mv_source_get_buffer(mvSource, &buffer, &bufferSize))
975                         return MEDIA_VISION_ERROR_INTERNAL;
976
977                 // TODO. Let's support various color spaces.
978
979                 if (colorspace != MEDIA_VISION_COLORSPACE_RGB888) {
980                         LOGE("Not Supported format!\n");
981                         return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
982                 }
983
984                 if (roi == NULL) {
985                         cvSource = cv::Mat(cv::Size(width, height), CV_MAKETYPE(CV_8U, 3),
986                                                            buffer)
987                                                            .clone();
988                 } else {
989                         cvRoi.x = roi->point.x;
990                         cvRoi.y = roi->point.y;
991                         cvRoi.width = (roi->point.x + roi->width) >= width ?
992                                                                   width - roi->point.x :
993                                                                   roi->width;
994                         cvRoi.height = (roi->point.y + roi->height) >= height ?
995                                                                    height - roi->point.y :
996                                                                    roi->height;
997                         cvSource = cv::Mat(cv::Size(width, height), CV_MAKETYPE(CV_8U, 3),
998                                                            buffer)(cvRoi)
999                                                            .clone();
1000                 }
1001
1002                 LOGE("Size: w:%u, h:%u", cvSource.size().width, cvSource.size().height);
1003
1004                 if (mCh != 1 && mCh != 3) {
1005                         LOGE("Channel not supported.");
1006                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
1007                 }
1008
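                // Fill each input tensor: Preprocess() writes the normalized image directly
                // into the backend's input tensor buffer.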
1009                 std::vector<inference_engine_tensor_buffer>::iterator iter;
1010                 for (iter = mInputTensorBuffers.begin();
1011                          iter != mInputTensorBuffers.end(); iter++) {
1012                         inference_engine_tensor_buffer tensor_buffer = *iter;
1013
1014                         int data_type = ConvertToCv(tensor_buffer.data_type);
1015
1016                         // Convert color space of input tensor data and then normalize it.
1017                         ret = Preprocess(cvSource,
1018                                                          cv::Mat(mInputSize.height, mInputSize.width,
1019                                                                          data_type, tensor_buffer.buffer),
1020                                                          data_type);
1021                         if (ret != MEDIA_VISION_ERROR_NONE) {
1022                                 LOGE("Fail to preprocess input tensor data.");
1023                                 return ret;
1024                         }
1025                 }
1026
1027                 ret = mBackend->Run(mInputTensorBuffers, mOutputTensorBuffers);
1028
1029                 return ConvertEngineErrorToVisionError(ret);
1030         }
1031
1032         std::pair<std::string, bool>
1033         Inference::GetSupportedInferenceBackend(int backend)
1034         {
1035                 return mSupportedInferenceBackend[backend];
1036         }
1037
1038         int Inference::GetClassficationResults(
1039                         ImageClassificationResults *classificationResults)
1040         {
1041                 tensor_t outputData;
1042
1043                 // Get inference result and contain it to outputData.
1044                 int ret = FillOutputResult(outputData);
1045                 if (ret != MEDIA_VISION_ERROR_NONE) {
1046                         LOGE("Fail to get output result.");
1047                         return ret;
1048                 }
1049
1050                 // Will contain top N results in ascending order.
1051                 std::vector<std::pair<float, int> > top_results;
1052                 std::priority_queue<std::pair<float, int>,
1053                                                         std::vector<std::pair<float, int> >,
1054                                                         std::greater<std::pair<float, int> > >
1055                                 top_result_pq;
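                // top_result_pq is a min-heap; once it holds more than mOutputNumbers
                // entries, the entry with the smallest score is popped.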
1056                 float value = 0.0f;
1057
1058                 std::vector<std::vector<int> > inferDimInfo(outputData.dimInfo);
1059                 std::vector<void *> inferResults(outputData.data.begin(),
1060                                                                                  outputData.data.end());
1061
1062                 int count = inferDimInfo[0][1];
1063                 LOGI("count: %d", count);
1064
1065                 float *prediction = reinterpret_cast<float *>(inferResults[0]);
1066                 for (int i = 0; i < count; ++i) {
1067                         value = prediction[i];
1068
1069                         // Push every score; entries that fall out of the top N are popped
1070                         // below, and the confidence threshold is applied afterwards.
1071                         top_result_pq.push(std::pair<float, int>(value, i));
1072
1073                         // If at capacity, kick the smallest value out.
1074                         if (top_result_pq.size() > mOutputNumbers) {
1075                                 top_result_pq.pop();
1076                         }
1077                 }
1078
1079                 // Copy to output vector and reverse into descending order.
1080                 while (!top_result_pq.empty()) {
1081                         top_results.push_back(top_result_pq.top());
1082                         top_result_pq.pop();
1083                 }
1084                 std::reverse(top_results.begin(), top_results.end());
1085
1086                 int classIdx = -1;
1087                 ImageClassificationResults results;
1088                 results.number_of_classes = 0;
1089                 for (int idx = 0; idx < top_results.size(); ++idx) {
1090                         if (top_results[idx].first < mThreshold)
1091                                 continue;
1092                         LOGI("idx:%d", idx);
1093                         LOGI("classIdx: %d", top_results[idx].second);
1094                         LOGI("classProb: %f", top_results[idx].first);
1095
1096                         classIdx = top_results[idx].second;
1097                         results.indices.push_back(classIdx);
1098                         results.confidences.push_back(top_results[idx].first);
1099                         results.names.push_back(mUserListName[classIdx]);
1100                         results.number_of_classes++;
1101                 }
1102
1103                 *classificationResults = results;
1104                 LOGE("Inference: GetClassificationResults: %d\n",
1105                          results.number_of_classes);
1106                 return MEDIA_VISION_ERROR_NONE;
1107         }
1108
1109         int Inference::GetObjectDetectionResults(
1110                         ObjectDetectionResults *detectionResults)
1111         {
1112                 tensor_t outputData;
1113
1114                 // Get inference result and contain it to outputData.
1115                 int ret = FillOutputResult(outputData);
1116                 if (ret != MEDIA_VISION_ERROR_NONE) {
1117                         LOGE("Fail to get output result.");
1118                         return ret;
1119                 }
1120
1121                 // In case of object detection,
1122                 // a model may apply post-process but others may not.
1123                 // Thus, those cases should be hanlded separately.
1124                 std::vector<std::vector<int> > inferDimInfo(outputData.dimInfo);
1125                 LOGI("inferDimInfo size: %zu", outputData.dimInfo.size());
1126
1127                 std::vector<void *> inferResults(outputData.data.begin(),
1128                                                                                  outputData.data.end());
1129                 LOGI("inferResults size: %zu", inferResults.size());
1130
1131                 float *boxes = nullptr;
1132                 float *classes = nullptr;
1133                 float *scores = nullptr;
1134                 int number_of_detections = 0;
1135
1136                 cv::Mat cvScores, cvClasses, cvBoxes;
1137                 if (outputData.dimInfo.size() == 1) {
1138                         // A single output tensor gives no separate field for the number of
1139                         // detections, so the backend has to provide it. For example, the
1140                         // OpenCV MobilenetV1-SSD output has shape 1x1xNx7, where the 1st of
1141                         // the 7 values is the image id. Batch mode isn't supported, so that
1142                         // field is unused and is repurposed to carry the number of detections,
1143                         // which is why it is read from outputData.data[0] below.
1144
1145                         number_of_detections = static_cast<int>(
1146                                         *reinterpret_cast<float *>(outputData.data[0]));
1147                         cv::Mat cvOutputData(number_of_detections, inferDimInfo[0][3],
1148                                                                  CV_32F, outputData.data[0]);
1149
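                             // Each of the N rows holds 7 values: [image_id, class_id, score, left, top, right, bottom].
                             // Pull out the box columns and rearrange them into [top, left, bottom, right] rows so they
                             // match the layout used by the multi-tensor branch below.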
1150                         // boxes
1151                         cv::Mat cvLeft = cvOutputData.col(3).clone();
1152                         cv::Mat cvTop = cvOutputData.col(4).clone();
1153                         cv::Mat cvRight = cvOutputData.col(5).clone();
1154                         cv::Mat cvBottom = cvOutputData.col(6).clone();
1155
1156                         cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
1157                         cv::hconcat(cvBoxElems, 4, cvBoxes);
1158
1159                         // classes
1160                         cvClasses = cvOutputData.col(1).clone();
1161
1162                         // scores
1163                         cvScores = cvOutputData.col(2).clone();
1164
1165                         boxes = cvBoxes.ptr<float>(0);
1166                         classes = cvClasses.ptr<float>(0);
1167                         scores = cvScores.ptr<float>(0);
1168
1169                 } else {
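                             // With multiple output tensors, the order assumed here is: boxes, classes, scores
                             // and, last, the number of detections (the layout typically produced by detection
                             // models with a built-in post-processing stage).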
1170                         boxes = reinterpret_cast<float *>(inferResults[0]);
1171                         classes = reinterpret_cast<float *>(inferResults[1]);
1172                         scores = reinterpret_cast<float *>(inferResults[2]);
1173                         number_of_detections =
1174                                         (int) (*reinterpret_cast<float *>(inferResults[3]));
1175                 }
1176
1177                 LOGI("number_of_detections = %d", number_of_detections);
1178
1179                 int left, top, right, bottom;
1180                 cv::Rect loc;
1181
1182                 ObjectDetectionResults results;
1183                 results.number_of_objects = 0;
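                     // Each detection comes as a normalized [top, left, bottom, right] box; scale it to the
                     // source image size and drop detections whose score is below mThreshold.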
1184                 for (int idx = 0; idx < number_of_detections; ++idx) {
1185                         if (scores[idx] < mThreshold)
1186                                 continue;
1187
1188                         left = static_cast<int>(boxes[idx * 4 + 1] * mSourceSize.width);
1189                         top = static_cast<int>(boxes[idx * 4 + 0] * mSourceSize.height);
1190                         right = static_cast<int>(boxes[idx * 4 + 3] * mSourceSize.width);
1191                         bottom = static_cast<int>(boxes[idx * 4 + 2] * mSourceSize.height);
1192
1193                         loc.x = left;
1194                         loc.y = top;
1195                         loc.width = right - left + 1;
1196                         loc.height = bottom - top + 1;
1197
1198                         results.indices.push_back(static_cast<int>(classes[idx]));
1199                         results.confidences.push_back(scores[idx]);
1200                         results.names.push_back(
1201                                         mUserListName[static_cast<int>(classes[idx])]);
1202                         results.locations.push_back(loc);
1203                         results.number_of_objects++;
1204
1205                         LOGI("objectClass: %d", static_cast<int>(classes[idx]));
1206                         LOGI("confidence:%f", scores[idx]);
1207                         LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right,
1208                                  bottom);
1209                 }
1210
1211                 *detectionResults = results;
1212                 LOGI("Inference: GetObjectDetectionResults: %d\n",
1213                          results.number_of_objects);
1214                 return MEDIA_VISION_ERROR_NONE;
1215         }
1216
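             // Convert the raw output of the last inference run into FaceDetectionResults.
             // The output parsing mirrors GetObjectDetectionResults(), but only confidences
             // and box locations are reported for faces.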
1217         int
1218         Inference::GetFaceDetectionResults(FaceDetectionResults *detectionResults)
1219         {
1220                 tensor_t outputData;
1221
1222                 // Get the inference result and store it in outputData.
1223                 int ret = FillOutputResult(outputData);
1224                 if (ret != MEDIA_VISION_ERROR_NONE) {
1225                         LOGE("Fail to get output result.");
1226                         return ret;
1227                 }
1228
1229                 // In the case of face detection,
1230                 // some models apply post-processing while others do not.
1231                 // Thus, those cases have to be handled separately.
1232                 std::vector<std::vector<int> > inferDimInfo(outputData.dimInfo);
1233                 LOGI("inferDimInfo size: %zu", outputData.dimInfo.size());
1234
1235                 std::vector<void *> inferResults(outputData.data.begin(),
1236                                                                                  outputData.data.end());
1237                 LOGI("inferResults size: %zu", inferResults.size());
1238
1239                 float *boxes = nullptr;
1240                 float *classes = nullptr;
1241                 float *scores = nullptr;
1242                 int number_of_detections = 0;
1243
1244                 cv::Mat cvScores, cvClasses, cvBoxes;
1245                 if (outputData.dimInfo.size() == 1) {
1246                         // There is no way to know how many objects were detected unless that number is
1247                         // provided explicitly, so in this case the backend has to supply it itself.
1248                         // For example, with OpenCV, MobilenetV1-SSD does not provide it, so the number of
1249                         // objects is written to the 1st element of the output buffer (outputData.data[0]).
1250                         // The shape is 1x1xNx7 and the 1st of the 7 values indicates the image id, which is
1251                         // useless when batch mode is not supported, so that slot is reused to carry the count.
1252
1253                         number_of_detections = static_cast<int>(
1254                                         *reinterpret_cast<float *>(outputData.data[0]));
1255                         cv::Mat cvOutputData(number_of_detections, inferDimInfo[0][3],
1256                                                                  CV_32F, outputData.data[0]);
1257
1258                         // boxes
1259                         cv::Mat cvLeft = cvOutputData.col(3).clone();
1260                         cv::Mat cvTop = cvOutputData.col(4).clone();
1261                         cv::Mat cvRight = cvOutputData.col(5).clone();
1262                         cv::Mat cvBottom = cvOutputData.col(6).clone();
1263
1264                         cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
1265                         cv::hconcat(cvBoxElems, 4, cvBoxes);
1266
1267                         // classes
1268                         cvClasses = cvOutputData.col(1).clone();
1269
1270                         // scores
1271                         cvScores = cvOutputData.col(2).clone();
1272
1273                         boxes = cvBoxes.ptr<float>(0);
1274                         classes = cvClasses.ptr<float>(0);
1275                         scores = cvScores.ptr<float>(0);
1276
1277                 } else {
1278                         boxes = reinterpret_cast<float *>(inferResults[0]);
1279                         classes = reinterpret_cast<float *>(inferResults[1]);
1280                         scores = reinterpret_cast<float *>(inferResults[2]);
1281                         number_of_detections = static_cast<int>(
1282                                         *reinterpret_cast<float *>(inferResults[3]));
1283                 }
1284
1285                 int left, top, right, bottom;
1286                 cv::Rect loc;
1287
1288                 FaceDetectionResults results;
1289                 results.number_of_faces = 0;
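                     // Same box layout as in GetObjectDetectionResults(): normalized [top, left, bottom, right]
                     // values scaled back to the source image size, filtered by mThreshold.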
1290                 for (int idx = 0; idx < number_of_detections; ++idx) {
1291                         if (scores[idx] < mThreshold)
1292                                 continue;
1293
1294                         left = static_cast<int>(boxes[idx * 4 + 1] * mSourceSize.width);
1295                         top = static_cast<int>(boxes[idx * 4 + 0] * mSourceSize.height);
1296                         right = static_cast<int>(boxes[idx * 4 + 3] * mSourceSize.width);
1297                         bottom = static_cast<int>(boxes[idx * 4 + 2] * mSourceSize.height);
1298
1299                         loc.x = left;
1300                         loc.y = top;
1301                         loc.width = right - left + 1;
1302                         loc.height = bottom - top + 1;
1303
1304                         results.confidences.push_back(scores[idx]);
1305                         results.locations.push_back(loc);
1306                         results.number_of_faces++;
1307
1308                         LOGI("confidence:%f", scores[idx]);
1309                         LOGI("class: %f", classes[idx]);
1310                         LOGI("left:%f, top:%f, right:%f, bottom:%f", boxes[idx * 4 + 1],
1311                                  boxes[idx * 4 + 0], boxes[idx * 4 + 3], boxes[idx * 4 + 2]);
1312                         LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right,
1313                                  bottom);
1314                 }
1315
1316                 *detectionResults = results;
1317                 LOGI("Inference: GetFaceDetectionResults: %d\n",
1318                          results.number_of_faces);
1319                 return MEDIA_VISION_ERROR_NONE;
1320         }
1321
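             // Convert the raw output of the last inference run into FacialLandMarkDetectionResults.
             // The output tensor is read as a flat list of normalized landmark coordinates.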
1322         int Inference::GetFacialLandMarkDetectionResults(
1323                         FacialLandMarkDetectionResults *detectionResults)
1324         {
1325                 tensor_t outputData;
1326
1327                 // Get the inference result and store it in outputData.
1328                 int ret = FillOutputResult(outputData);
1329                 if (ret != MEDIA_VISION_ERROR_NONE) {
1330                         LOGE("Fail to get output result.");
1331                         return ret;
1332                 }
1333
1334                 std::vector<std::vector<int> > inferDimInfo(outputData.dimInfo);
1335                 std::vector<void *> inferResults(outputData.data.begin(),
1336                                                                                  outputData.data.end());
1337
1338                 long number_of_detections = inferDimInfo[0][1];
1339                 float *loc = reinterpret_cast<float *>(inferResults[0]);
1340
1341                 FacialLandMarkDetectionResults results;
1342                 results.number_of_landmarks = 0;
1343
1344                 cv::Point point(0, 0);
1346                 LOGI("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height);
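                     // loc holds interleaved normalized (x, y) pairs, two values per landmark;
                     // scale each pair to the source image size.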
1347                 for (int idx = 0; idx < number_of_detections; idx += 2) {
1348                         point.x = static_cast<int>(loc[idx] * mSourceSize.width);
1349                         point.y = static_cast<int>(loc[idx + 1] * mSourceSize.height);
1350
1351                         results.locations.push_back(point);
1352                         results.number_of_landmarks++;
1353
1354                         LOGI("x:%d, y:%d", point.x, point.y);
1355                 }
1356
1357                 *detectionResults = results;
1358                 LOGI("Inference: FacialLandmarkDetectionResults: %d\n",
1359                          results.number_of_landmarks);
1360                 return MEDIA_VISION_ERROR_NONE;
1361         }
1362
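             // Convert the raw output of the last inference run into PoseEstimationResults by
             // locating the peak of each per-keypoint heat map and mapping it back to
             // source-image coordinates.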
1363         int Inference::GetPoseEstimationDetectionResults(
1364                         PoseEstimationResults *detectionResults)
1365         {
1366                 tensor_t outputData;
1367
1368                 // Get the inference result and store it in outputData.
1369                 int ret = FillOutputResult(outputData);
1370                 if (ret != MEDIA_VISION_ERROR_NONE) {
1371                         LOGE("Fail to get output result.");
1372                         return ret;
1373                 }
1374
1375                 std::vector<std::vector<int> > inferDimInfo(outputData.dimInfo);
1376                 std::vector<void *> inferResults(outputData.data.begin(),
1377                                                                                  outputData.data.end());
1378
1379                 long number_of_pose = inferDimInfo[0][3];
1380                 float *tmp = static_cast<float *>(inferResults[0]);
1381                 cv::Size heatMapSize(inferDimInfo[0][1], inferDimInfo[0][2]);
1382
1383                 cv::Point loc;
1384                 double score;
1385                 cv::Mat blurredHeatMap;
1386
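                     // Interpret the flat output buffer as a heat-map image with one channel per
                     // keypoint, then split it into individual single-channel heat maps.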
1387                 cv::Mat reShapeTest(cv::Size(inferDimInfo[0][2], inferDimInfo[0][1]),
1388                                                         CV_32FC(inferDimInfo[0][3]), (void *) tmp);
1389
1390                 std::vector<cv::Mat> multiChannels;
1391                 cv::split(reShapeTest, multiChannels);
1392
1393                 float ratioX = static_cast<float>(mSourceSize.width) /
1394                                            static_cast<float>(inferDimInfo[0][2]);
1395                 float ratioY = static_cast<float>(mSourceSize.height) /
1396                                            static_cast<float>(inferDimInfo[0][1]);
1397
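                     // For each keypoint, take the location of the strongest heat-map response and map
                     // it back to source-image coordinates using the width/height ratios above.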
1398                 PoseEstimationResults results;
1399                 results.number_of_pose_estimation = 0;
1400                 for (int poseIdx = 0; poseIdx < number_of_pose; poseIdx++) {
1401                         cv::Mat heatMap = multiChannels[poseIdx];
1402
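                             // Note: a Gaussian-blurred copy of the heat map is computed here, but the peak
                             // search below still runs on the raw heat map.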
1403                         cv::GaussianBlur(heatMap, blurredHeatMap, cv::Size(), 5.0, 5.0);
1404                         cv::minMaxLoc(heatMap, NULL, &score, NULL, &loc);
1405
1406                         LOGI("PoseIdx[%2d]: x[%2d], y[%2d], score[%.3f]", poseIdx, loc.x,
1407                                  loc.y, score);
1408                         LOGI("PoseIdx[%2d]: x[%2d], y[%2d], score[%.3f]", poseIdx,
1409                                  static_cast<int>(static_cast<float>(loc.x + 1) * ratioX),
1410                                  static_cast<int>(static_cast<float>(loc.y + 1) * ratioY),
1411                                  score);
1412
1413                         loc.x = static_cast<int>(static_cast<float>(loc.x + 1) * ratioX);
1414                         loc.y = static_cast<int>(static_cast<float>(loc.y + 1) * ratioY);
1415                         results.locations.push_back(loc);
1416                         results.number_of_pose_estimation++;
1417                 }
1418
1419                 *detectionResults = results;
1420                 LOGI("Inference: PoseEstimationResults: %d\n",
1421                          results.number_of_pose_estimation);
1422                 return MEDIA_VISION_ERROR_NONE;
1423         }
1424
1425 } /* Inference */
1426 } /* MediaVision */