62262adc63c19585ac696a5df06f51480e9d273b
[platform/core/api/mediavision.git] / mv_inference / inference / src / Inference.cpp
1 /**
2  * Copyright (c) 2019 Samsung Electronics Co., Ltd All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "mv_private.h"
18 #include "Inference.h"
19 #include "InferenceIni.h"
20
21 #include <map>
22
23 #include <unistd.h>
24 #include <fstream>
25 #include <string>
26 #include <queue>
27 #include <algorithm>
28
29 #define MV_INFERENCE_OUTPUT_NUMBERS_MAX 10
30 #define MV_INFERENCE_OUTPUT_NUMBERS_MIN 1
31 #define MV_INFERENCE_CONFIDENCE_THRESHOLD_MAX 1.0
32 #define MV_INFERENCE_CONFIDENCE_THRESHOLD_MIN 0.0
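// Clamp ranges used by ConfigureOutput() and ConfigureThreshold() below.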
33
34 typedef enum {
35         InputAttrNoType = 0,
36         InputAttrFloat32 = 1,
37         InputAttrInt32 = 2,
38         InputAttrUInt8 = 3,
39         InputAttrInt64 = 4,
40         InputAttrString = 5,
41         InputAttrBool = 6,
42 } InputAttrType;
43
44 namespace mediavision
45 {
46 namespace inference
47 {
48         InferenceConfig::InferenceConfig() :
49                         mConfigFilePath(),
50                         mWeightFilePath(),
51                         mUserFilePath(),
52                         mDataType(MV_INFERENCE_DATA_FLOAT32),
53                         mBackedType(MV_INFERENCE_BACKEND_NONE),
54                         mTargetTypes(MV_INFERENCE_TARGET_NONE),
55                         mConfidenceThresHold(),
56                         mMeanValue(),
57                         mStdValue(),
58                         mMaxOutputNumbers(1)
59         {
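                // Initialize tensor dimensions to -1, i.e. not configured yet.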
60                 mTensorInfo.width = -1;
61                 mTensorInfo.height = -1;
62                 mTensorInfo.dim = -1;
63                 mTensorInfo.ch = -1;
64         }
65
66         Inference::Inference() :
67                         mCanRun(),
68                         mConfig(),
69                         mBackendCapacity(),
70                         mSupportedInferenceBackend(),
71                         mInputSize(cv::Size()),
72                         mCh(),
73                         mDim(),
74                         mDeviation(),
75                         mMean(),
76                         mThreshold(),
77                         mOutputNumbers(),
78                         mSourceSize(cv::Size()),
79                         mInputBuffer(cv::Mat()),
80                         engine_config(),
81                         mBackend()
82         {
83                 LOGI("ENTER");
84
85                 mSupportedInferenceBackend.insert(std::make_pair(
86                                 MV_INFERENCE_BACKEND_OPENCV, std::make_pair("opencv", false)));
87                 mSupportedInferenceBackend.insert(std::make_pair(
88                                 MV_INFERENCE_BACKEND_TFLITE, std::make_pair("tflite", false)));
89                 mSupportedInferenceBackend.insert(std::make_pair(
90                                 MV_INFERENCE_BACKEND_ARMNN, std::make_pair("armnn", false)));
91                 mSupportedInferenceBackend.insert(std::make_pair(
92                                 MV_INFERENCE_BACKEND_MLAPI, std::make_pair("mlapi", false)));
93                 mSupportedInferenceBackend.insert(std::make_pair(
94                                 MV_INFERENCE_BACKEND_NNFW, std::make_pair("mlapi", false)));
95
96                 CheckSupportedInferenceBackend();
97
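                // Log, for each known backend, whether the ini configuration enabled it.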
98                 for (int i = 0; i < MV_INFERENCE_BACKEND_MAX; ++i) {
99                         auto iter = mSupportedInferenceBackend.find(i);
100                         LOGE("%d: %s: %s", i, (iter->second).first.c_str(),
101                                  (iter->second).second ? "TRUE" : "FALSE");
102                 }
103
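                // Map model file extensions to model formats. Load() selects the format
                // from the weight file extension.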
104                 mModelFormats.insert(std::make_pair<std::string, int>(
105                                 "caffemodel", INFERENCE_MODEL_CAFFE));
106                 mModelFormats.insert(
107                                 std::make_pair<std::string, int>("pb", INFERENCE_MODEL_TF));
108                 mModelFormats.insert(std::make_pair<std::string, int>(
109                                 "tflite", INFERENCE_MODEL_TFLITE));
110                 mModelFormats.insert(
111                                 std::make_pair<std::string, int>("t7", INFERENCE_MODEL_TORCH));
112                 mModelFormats.insert(std::make_pair<std::string, int>(
113                                 "weights", INFERENCE_MODEL_DARKNET));
114                 mModelFormats.insert(
115                                 std::make_pair<std::string, int>("bin", INFERENCE_MODEL_DLDT));
116                 mModelFormats.insert(
117                                 std::make_pair<std::string, int>("onnx", INFERENCE_MODEL_ONNX));
118                 mModelFormats.insert(std::make_pair<std::string, int>(
119                                 "nb", INFERENCE_MODEL_VIVANTE));
120
121                 LOGI("LEAVE");
122         }
123
124         Inference::~Inference()
125         {
126                 CleanupTensorBuffers();
127
128                 if (!mInputLayerProperty.tensor_infos.empty()) {
129                         mInputLayerProperty.tensor_infos.clear();
130                         std::vector<inference_engine_tensor_info>().swap(
131                                         mInputLayerProperty.tensor_infos);
132                 }
133                 if (!mOutputLayerProperty.tensor_infos.empty()) {
134                         mOutputLayerProperty.tensor_infos.clear();
135                         std::vector<inference_engine_tensor_info>().swap(
136                                         mOutputLayerProperty.tensor_infos);
137                 }
138
139                 mModelFormats.clear();
140
141                 // Release backend engine.
142                 if (mBackend) {
143                         mBackend->UnbindBackend();
144                         delete mBackend;
145                 }
146
147                 LOGI("Released backend engine.");
148         }
149
150         void Inference::CheckSupportedInferenceBackend()
151         {
152                 LOGE("ENTER");
153
154                 InferenceInI ini;
155                 ini.LoadInI();
156
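                // Mark every backend listed in the ini file as supported.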
157                 std::vector<int> supportedBackend = ini.GetSupportedInferenceEngines();
158                 for (std::vector<int>::const_iterator it = supportedBackend.begin();
159                          it != supportedBackend.end(); ++it) {
160                         LOGE("engine: %d", *it);
161
162                         auto iter = mSupportedInferenceBackend.find(*it);
163                         (iter->second).second = true;
164                 }
165
166                 LOGE("LEAVE");
167         }
168
169         int Inference::ConvertEngineErrorToVisionError(int error)
170         {
171                 int ret = MEDIA_VISION_ERROR_NONE;
172
173                 switch (error) {
174                 case INFERENCE_ENGINE_ERROR_NONE:
175                         ret = MEDIA_VISION_ERROR_NONE;
176                         break;
177                 case INFERENCE_ENGINE_ERROR_NOT_SUPPORTED:
178                         ret = MEDIA_VISION_ERROR_NOT_SUPPORTED;
179                         break;
180                 case INFERENCE_ENGINE_ERROR_MSG_TOO_LONG:
181                         ret = MEDIA_VISION_ERROR_MSG_TOO_LONG;
182                         break;
183                 case INFERENCE_ENGINE_ERROR_NO_DATA:
184                         ret = MEDIA_VISION_ERROR_NO_DATA;
185                         break;
186                 case INFERENCE_ENGINE_ERROR_KEY_NOT_AVAILABLE:
187                         ret = MEDIA_VISION_ERROR_KEY_NOT_AVAILABLE;
188                         break;
189                 case INFERENCE_ENGINE_ERROR_OUT_OF_MEMORY:
190                         ret = MEDIA_VISION_ERROR_OUT_OF_MEMORY;
191                         break;
192                 case INFERENCE_ENGINE_ERROR_INVALID_PARAMETER:
193                         ret = MEDIA_VISION_ERROR_INVALID_PARAMETER;
194                         break;
195                 case INFERENCE_ENGINE_ERROR_INVALID_OPERATION:
196                         ret = MEDIA_VISION_ERROR_INVALID_OPERATION;
197                         break;
198                 case INFERENCE_ENGINE_ERROR_PERMISSION_DENIED:
199                         ret = MEDIA_VISION_ERROR_PERMISSION_DENIED;
200                         break;
201                 case INFERENCE_ENGINE_ERROR_NOT_SUPPORTED_FORMAT:
202                         ret = MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
203                         break;
204                 case INFERENCE_ENGINE_ERROR_INTERNAL:
205                         ret = MEDIA_VISION_ERROR_INTERNAL;
206                         break;
207                 case INFERENCE_ENGINE_ERROR_INVALID_DATA:
208                         ret = MEDIA_VISION_ERROR_INVALID_DATA;
209                         break;
210                 case INFERENCE_ENGINE_ERROR_INVALID_PATH:
211                         ret = MEDIA_VISION_ERROR_INVALID_PATH;
212                         break;
213                 default:
214                         LOGE("Unknown inference engine error type");
215                 }
216
217                 return ret;
218         }
219
220         int Inference::ConvertTargetTypes(int given_types)
221         {
222                 int target_types = INFERENCE_TARGET_NONE;
223
224                 if (given_types & MV_INFERENCE_TARGET_DEVICE_CPU)
225                         target_types |= INFERENCE_TARGET_CPU;
226                 if (given_types & MV_INFERENCE_TARGET_DEVICE_GPU)
227                         target_types |= INFERENCE_TARGET_GPU;
228                 if (given_types & MV_INFERENCE_TARGET_DEVICE_CUSTOM)
229                         target_types |= INFERENCE_TARGET_CUSTOM;
230
231                 return target_types;
232         }
233
234         int Inference::ConvertToCv(int given_type)
235         {
236                 int type = 0;
237
238                 switch (given_type) {
239                 case INFERENCE_TENSOR_DATA_TYPE_UINT8:
240                         LOGI("Type is %d ch with UINT8", mCh);
241                         type = mCh == 1 ? CV_8UC1 : CV_8UC3;
242                         break;
243                 case INFERENCE_TENSOR_DATA_TYPE_FLOAT32:
244                         LOGI("Type is %d ch with FLOAT32", mCh);
245                         type = mCh == 1 ? CV_32FC1 : CV_32FC3;
246                         break;
247                 default:
248                         LOGI("unknown data type so FLOAT32 data type will be used by default");
249                         type = mCh == 1 ? CV_32FC1 : CV_32FC3;
250                         break;
251                 }
252
253                 return type;
254         }
255
256         inference_tensor_data_type_e Inference::ConvertToIE(int given_type)
257         {
258                 inference_tensor_data_type_e type = INFERENCE_TENSOR_DATA_TYPE_FLOAT32;
259
260                 switch (given_type) {
261                 case MV_INFERENCE_DATA_FLOAT32:
262                         type = INFERENCE_TENSOR_DATA_TYPE_FLOAT32;
263                         break;
264                 case MV_INFERENCE_DATA_UINT8:
265                         type = INFERENCE_TENSOR_DATA_TYPE_UINT8;
266                         break;
267                 default:
268                         LOGI("unknown data type so FLOAT32 data type will be used by default");
269                         break;
270                 }
271
272                 return type;
273         }
274
275         int Inference::Preprocess(cv::Mat cvImg, cv::Mat cvDst, int data_type)
276         {
277                 mSourceSize = cvImg.size();
278                 int width = mInputSize.width;
279                 int height = mInputSize.height;
280
281                 cv::Mat sample;
282                 if (cvImg.channels() == 3 && mCh == 1)
283                         cv::cvtColor(cvImg, sample, cv::COLOR_BGR2GRAY);
284                 else
285                         sample = cvImg;
286
287                 // size
288                 cv::Mat sampleResized;
289                 if (sample.size() != cv::Size(width, height))
290                         cv::resize(sample, sampleResized, cv::Size(width, height));
291                 else
292                         sampleResized = sample;
293
294                 // type
295                 cv::Mat sampleFloat;
296                 if (mCh == 3)
297                         sampleResized.convertTo(sampleFloat, CV_32FC3);
298                 else
299                         sampleResized.convertTo(sampleFloat, CV_32FC1);
300
301                 // normalize
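                // dst = (src - mean) / deviation, applied per channel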
302                 cv::Mat sampleNormalized;
303                 cv::Mat meanMat;
304                 if (mCh == 3)
305                         meanMat = cv::Mat(sampleFloat.size(), CV_32FC3,
306                                                           cv::Scalar((float) mMean, (float) mMean,
307                                                           (float) mMean));
308                 else
309                         meanMat = cv::Mat(sampleFloat.size(), CV_32FC1,
310                                                           cv::Scalar((float) mMean));
311
312                 cv::subtract(sampleFloat, meanMat, sampleNormalized);
313
314                 sampleNormalized /= static_cast<float>(mDeviation);
315
316                 sampleNormalized.convertTo(cvDst, data_type);
317
318                 return MEDIA_VISION_ERROR_NONE;
319         }
320
321         int Inference::SetUserFile(std::string filename)
322         {
323                 std::ifstream fp(filename.c_str());
324                 if (!fp.is_open()) {
325                         return MEDIA_VISION_ERROR_INVALID_PATH;
326                 }
327
328                 std::string userListName;
329                 while (std::getline(fp, userListName)) {
330                         // One label per line; skip empty lines.
331                         if (!userListName.empty())
332                                 mUserListName.push_back(userListName);
333                 }
334
335                 fp.close();
336
337                 return MEDIA_VISION_ERROR_NONE;
338         }
339
340         void Inference::ConfigureModelFiles(const std::string modelConfigFilePath,
341                                                                                 const std::string modelWeightFilePath,
342                                                                                 const std::string modelUserFilePath)
343         {
344                 LOGI("ENTER");
345
346                 mConfig.mConfigFilePath = modelConfigFilePath;
347                 mConfig.mWeightFilePath = modelWeightFilePath;
348                 mConfig.mUserFilePath = modelUserFilePath;
349
350                 LOGI("LEAVE");
351         }
352
353         void Inference::ConfigureTensorInfo(int width, int height, int dim, int ch,
354                                                                                 double stdValue, double meanValue)
355         {
356                 LOGI("ENTER");
357
358                 mConfig.mTensorInfo = { width, height, dim, ch };
359                 mConfig.mStdValue = stdValue;
360                 mConfig.mMeanValue = meanValue;
361
362                 LOGI("LEAVE");
363         }
364
365         void Inference::ConfigureInputInfo(int width, int height, int dim, int ch,
366                                                                            double stdValue, double meanValue,
367                                                                            int dataType,
368                                                                            const std::vector<std::string> names)
369         {
370                 LOGI("ENTER");
371
372                 mConfig.mTensorInfo = { width, height, dim, ch };
373                 mConfig.mStdValue = stdValue;
374                 mConfig.mMeanValue = meanValue;
375                 mConfig.mDataType = static_cast<mv_inference_data_type_e>(dataType);
376                 mConfig.mInputLayerNames = names;
377
378                 inference_engine_layer_property property;
379                 // If an inference plugin doesn't support querying layer properties,
380                 // the tensor info given by the user will be used.
381                 // If the plugin does support it, the given info will be ignored.
382                 inference_engine_tensor_info tensor_info;
383
384                 tensor_info.data_type = ConvertToIE(dataType);
385
386                 // The OpenCV backend only supports NCHW.
387                 tensor_info.shape_type = INFERENCE_TENSOR_SHAPE_NCHW;
388                 // modify to handle multiple tensor infos
389                 tensor_info.shape.push_back(mConfig.mTensorInfo.dim);
390                 tensor_info.shape.push_back(mConfig.mTensorInfo.ch);
391                 tensor_info.shape.push_back(mConfig.mTensorInfo.height);
392                 tensor_info.shape.push_back(mConfig.mTensorInfo.width);
393
394                 tensor_info.size = 1;
395                 for (std::vector<size_t>::iterator iter = tensor_info.shape.begin();
396                          iter != tensor_info.shape.end(); ++iter) {
397                         tensor_info.size *= (*iter);
398                 }
399
400                 property.layer_names = mConfig.mInputLayerNames;
401                 property.tensor_infos.push_back(tensor_info);
402
403                 int ret = mBackend->SetInputLayerProperty(property);
404                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
405                         LOGE("Fail to set input layer property");
406                 }
407
408                 LOGI("LEAVE");
409         }
410
411         void Inference::ConfigureOutputInfo(const std::vector<std::string> names)
412         {
413                 LOGI("ENTER");
414
415                 mConfig.mOutputLayerNames = names;
416
417                 inference_engine_layer_property property;
418
419                 property.layer_names = names;
420                 int ret = mBackend->SetOutputLayerProperty(property);
421                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
422                         LOGE("Fail to set output layer property");
423                 }
424
425                 LOGI("LEAVE");
426         }
427
428         int Inference::ConfigureBackendType(
429                         const mv_inference_backend_type_e backendType)
430         {
431                 std::pair<std::string, bool> backend =
432                                 mSupportedInferenceBackend[backendType];
433                 if (backend.second == false) {
434                         LOGE("%s type is not supported", (backend.first).c_str());
435                         return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
436                 }
437
438                 LOGI("backend engine : %d", backendType);
439
440                 mConfig.mBackedType = backendType;
441
442                 return MEDIA_VISION_ERROR_NONE;
443         }
444
445         int Inference::ConfigureTargetTypes(const int targetType)
446         {
447                 // Check if given target types are valid or not.
448                 if (MV_INFERENCE_TARGET_NONE >= targetType ||
449                         MV_INFERENCE_TARGET_MAX <= targetType) {
450                         LOGE("Invalid target device.");
451                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
452                 }
453
454                 LOGI("Before converting target types : %d", targetType);
455
456                 unsigned int new_type = MV_INFERENCE_TARGET_DEVICE_NONE;
457
458                 // Convert old type to new one.
459                 switch (targetType) {
460                 case MV_INFERENCE_TARGET_CPU:
461                         new_type = MV_INFERENCE_TARGET_DEVICE_CPU;
462                         break;
463                 case MV_INFERENCE_TARGET_GPU:
464                         new_type = MV_INFERENCE_TARGET_DEVICE_GPU;
465                         break;
466                 case MV_INFERENCE_TARGET_CUSTOM:
467                         new_type = MV_INFERENCE_TARGET_DEVICE_CUSTOM;
468                         break;
469                 }
470
471                 LOGI("After converting target types : %d", new_type);
472
473                 mConfig.mTargetTypes = new_type;
474
475                 return MEDIA_VISION_ERROR_NONE;
476         }
477
478         int Inference::ConfigureTargetDevices(const int targetDevices)
479         {
480                 // Check if given target types are valid or not.
481                 if (MV_INFERENCE_TARGET_DEVICE_NONE >= targetDevices ||
482                         MV_INFERENCE_TARGET_DEVICE_MAX <= targetDevices) {
483                         LOGE("Invalid target device.");
484                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
485                 }
486
487                 LOGI("target devices : %d", targetDevices);
488
489                 mConfig.mTargetTypes = targetDevices;
490
491                 return MEDIA_VISION_ERROR_NONE;
492         }
493
494         void Inference::ConfigureOutput(const int maxOutputNumbers)
495         {
496                 mConfig.mMaxOutputNumbers = std::max(
497                                 std::min(maxOutputNumbers, MV_INFERENCE_OUTPUT_NUMBERS_MAX),
498                                 MV_INFERENCE_OUTPUT_NUMBERS_MIN);
499         }
500
501         void Inference::ConfigureThreshold(const double threshold)
502         {
503                 mConfig.mConfidenceThresHold = std::max(
504                                 std::min(threshold, MV_INFERENCE_CONFIDENCE_THRESHOLD_MAX),
505                                 MV_INFERENCE_CONFIDENCE_THRESHOLD_MIN);
506         }
507
508         void Inference::CleanupTensorBuffers(void)
509         {
510                 LOGI("ENTER");
511
512                 if (!mInputTensorBuffers.empty()) {
513                         std::vector<inference_engine_tensor_buffer>::iterator iter;
514                         for (iter = mInputTensorBuffers.begin();
515                                  iter != mInputTensorBuffers.end(); iter++) {
516                                 inference_engine_tensor_buffer tensor_buffer = *iter;
517
518                                 // If the tensor buffer is owned by the backend, skip releasing it
519                                 // here; the backend will release it.
520                                 if (tensor_buffer.owner_is_backend) {
521                                         continue;
522                                 }
523
524                                 if (tensor_buffer.data_type ==
525                                         INFERENCE_TENSOR_DATA_TYPE_FLOAT32)
526                                         delete[] static_cast<float *>(tensor_buffer.buffer);
527                                 else
528                                         delete[] static_cast<unsigned char *>(tensor_buffer.buffer);
529                         }
530
531                         LOGI("input tensor buffers(%zu) have been released.",
532                                  mInputTensorBuffers.size());
533                         std::vector<inference_engine_tensor_buffer>().swap(
534                                         mInputTensorBuffers);
535                 }
536
537                 if (!mOutputTensorBuffers.empty()) {
538                         std::vector<inference_engine_tensor_buffer>::iterator iter;
539                         for (iter = mOutputTensorBuffers.begin();
540                                  iter != mOutputTensorBuffers.end(); iter++) {
541                                 inference_engine_tensor_buffer tensor_buffer = *iter;
542
543                                 // If the tensor buffer is owned by the backend, skip releasing it
544                                 // here; the backend will release it.
545                                 if (tensor_buffer.owner_is_backend) {
546                                         continue;
547                                 }
548
549                                 if (tensor_buffer.data_type ==
550                                         INFERENCE_TENSOR_DATA_TYPE_FLOAT32)
551                                         delete[] static_cast<float *>(tensor_buffer.buffer);
552                                 else
553                                         delete[] static_cast<unsigned char *>(tensor_buffer.buffer);
554                         }
555
556                         LOGI("output tensor buffers(%zu) have been released.",
557                                  mOutputTensorBuffers.size());
558                         std::vector<inference_engine_tensor_buffer>().swap(
559                                         mOutputTensorBuffers);
560                 }
561
562                 LOGI("LEAVE");
563         }
564
565         int Inference::PrepareTenosrBuffers(void)
566         {
567                 // If input and output tensor buffers were allocated before, release them.
568                 // They will be allocated again according to the new model file to be loaded.
569                 CleanupTensorBuffers();
570
571                 // If the model file is loaded again, the model type could be different, so
572                 // clean up the input and output layer properties so that they can be
573                 // updated again after reloading the model file.
574                 if (!mInputLayerProperty.tensor_infos.empty()) {
575                         mInputLayerProperty.tensor_infos.clear();
576                         std::vector<inference_engine_tensor_info>().swap(
577                                         mInputLayerProperty.tensor_infos);
578                 }
579                 if (!mOutputLayerProperty.tensor_infos.empty()) {
580                         mOutputLayerProperty.tensor_infos.clear();
581                         std::vector<inference_engine_tensor_info>().swap(
582                                         mOutputLayerProperty.tensor_infos);
583                 }
584
585                 // Get input tensor buffers from the backend engine if it allocated them.
586                 int ret = mBackend->GetInputTensorBuffers(mInputTensorBuffers);
587                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
588                         LOGE("Fail to get input tensor buffers from backend engine.");
589                         return ConvertEngineErrorToVisionError(ret);
590                 }
591
592                 ret = mBackend->GetInputLayerProperty(mInputLayerProperty);
593                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
594                         LOGE("Fail to get input layer property from backend engine.");
595                         return ConvertEngineErrorToVisionError(ret);
596                 }
597
598                 // If the backend engine isn't able to allocate input tensor buffers internally,
599                 // then allocate the buffers here.
600                 if (mInputTensorBuffers.empty()) {
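                        // tensor_info.size is an element count; tensor_buffer.size is in bytes.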
601                         for (int i = 0; i < mInputLayerProperty.tensor_infos.size(); ++i) {
602                                 inference_engine_tensor_info tensor_info =
603                                                 mInputLayerProperty.tensor_infos[i];
604                                 inference_engine_tensor_buffer tensor_buffer;
605                                 if (tensor_info.data_type ==
606                                         INFERENCE_TENSOR_DATA_TYPE_FLOAT32) {
607                                         tensor_buffer.buffer = new float[tensor_info.size];
608                                         tensor_buffer.size = tensor_info.size * 4;
609                                 } else if (tensor_info.data_type ==
610                                                    INFERENCE_TENSOR_DATA_TYPE_UINT8) {
611                                         tensor_buffer.buffer = new unsigned char[tensor_info.size];
612                                         tensor_buffer.size = tensor_info.size;
613                                 } else if (tensor_info.data_type ==
614                                                    INFERENCE_TENSOR_DATA_TYPE_FLOAT16) {
615                                         tensor_buffer.buffer = new short[tensor_info.size];
616                                         tensor_buffer.size = tensor_info.size * 2; // 2 bytes per FLOAT16 element
617                                 } else {
618                                         LOGE("Invalid input tensor data type.");
619                                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
620                                 }
621
622                                 if (tensor_buffer.buffer == NULL) {
623                                         LOGE("Fail to allocate input tensor buffer.");
624                                         return MEDIA_VISION_ERROR_OUT_OF_MEMORY;
625                                 }
626
627                                 LOGI("Allocated input tensor buffer(size = %zu, data type = %d)",
628                                          tensor_info.size, tensor_info.data_type);
629                                 tensor_buffer.owner_is_backend = 0;
630                                 tensor_buffer.data_type = tensor_info.data_type;
631                                 mInputTensorBuffers.push_back(tensor_buffer);
632                         }
633                 }
634
635                 LOGI("Input tensor buffer count is %zu", mInputTensorBuffers.size());
636
637                 // Get output tensor buffers from the backend engine if it allocated them.
638                 ret = mBackend->GetOutputTensorBuffers(mOutputTensorBuffers);
639                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
640                         LOGE("Fail to get output tensor buffers from backend engine.");
641                         return ConvertEngineErrorToVisionError(ret);
642                 }
643
644                 ret = mBackend->GetOutputLayerProperty(mOutputLayerProperty);
645                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
646                         LOGE("Fail to get output layer property from backend engine.");
647                         return ConvertEngineErrorToVisionError(ret);
648                 }
649
650                 // If the backend engine isn't able to allocate output tensor buffers internally,
651                 // then allocate the buffers here.
652                 if (mOutputTensorBuffers.empty()) {
653                         for (int i = 0; i < mOutputLayerProperty.tensor_infos.size(); ++i) {
654                                 inference_engine_tensor_info tensor_info =
655                                                 mOutputLayerProperty.tensor_infos[i];
656                                 inference_engine_tensor_buffer tensor_buffer;
657                                 if (tensor_info.data_type ==
658                                         INFERENCE_TENSOR_DATA_TYPE_FLOAT32) {
659                                         tensor_buffer.buffer = new float[tensor_info.size];
660                                         tensor_buffer.size = tensor_info.size * 4;
661                                 } else if (tensor_info.data_type ==
662                                                    INFERENCE_TENSOR_DATA_TYPE_UINT8) {
663                                         tensor_buffer.buffer = new unsigned char[tensor_info.size];
664                                         tensor_buffer.size = tensor_info.size;
665                                 } else if (tensor_info.data_type ==
666                                                    INFERENCE_TENSOR_DATA_TYPE_FLOAT16) {
667                                         tensor_buffer.buffer = new short[tensor_info.size];
668                                         tensor_buffer.size = tensor_info.size * 2; // 2 bytes per FLOAT16 element
669                                 } else {
670                                         LOGE("Invalid output tensor data type.");
671                                         CleanupTensorBuffers();
672                                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
673                                 }
674
675                                 if (tensor_buffer.buffer == NULL) {
676                                         LOGE("Fail to allocate output tensor buffer.");
677                                         CleanupTensorBuffers();
678                                         return MEDIA_VISION_ERROR_OUT_OF_MEMORY;
679                                 }
680
681                                 LOGI("Allocated output tensor buffer(size = %zu, data type = %d)",
682                                          tensor_info.size, tensor_info.data_type);
683
684                                 tensor_buffer.owner_is_backend = 0;
685                                 tensor_buffer.data_type = tensor_info.data_type;
686                                 mOutputTensorBuffers.push_back(tensor_buffer);
687                         }
688                 }
689
690                 LOGI("Output tensor buffer count is %zu", mOutputTensorBuffers.size());
691
692                 return MEDIA_VISION_ERROR_NONE;
693         }
694
695         int Inference::FillOutputResult(tensor_t &outputData)
696         {
697                 for (int i = 0; i < mOutputLayerProperty.tensor_infos.size(); ++i) {
698                         inference_engine_tensor_info tensor_info =
699                                         mOutputLayerProperty.tensor_infos[i];
700
701                         std::vector<int> tmpDimInfo;
702                         for (int j = 0; j < static_cast<int>(tensor_info.shape.size());
703                                  j++) {
704                                 tmpDimInfo.push_back(tensor_info.shape[j]);
705                         }
706
707                         outputData.dimInfo.push_back(tmpDimInfo);
708
709                         // Normalize output tensor data converting it to float type in case of quantized model.
710                         if (tensor_info.data_type == INFERENCE_TENSOR_DATA_TYPE_UINT8) {
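                                // Dequantize assuming a fixed scale of 1/255 and a zero point of 0.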
711                                 float *new_buf = new float[tensor_info.size];
712                                 if (new_buf == NULL) {
713                                         LOGE("Fail to allocate a new output tensor buffer.");
714                                         return MEDIA_VISION_ERROR_OUT_OF_MEMORY;
715                                 }
716
717                                 unsigned char *ori_buf = static_cast<unsigned char *>(
718                                                 mOutputTensorBuffers[i].buffer);
719
720                                 for (int j = 0; j < tensor_info.size; j++) {
721                                         new_buf[j] = static_cast<float>(ori_buf[j]) / 255.0f;
722                                 }
723
724                         // Replace the original buffer with the new one and release the original.
725                                 mOutputTensorBuffers[i].buffer = new_buf;
726
727                                 if (!mOutputTensorBuffers[i].owner_is_backend)
728                                         delete[] ori_buf;
729                         }
730
731                         if (tensor_info.data_type == INFERENCE_TENSOR_DATA_TYPE_FLOAT16) {
732                                 float *new_buf = new float[tensor_info.size];
733                                 if (new_buf == NULL) {
734                                         LOGE("Fail to allocate a new output tensor buffer.");
735                                         return MEDIA_VISION_ERROR_OUT_OF_MEMORY;
736                                 }
737
738                                 short *ori_buf =
739                                                 static_cast<short *>(mOutputTensorBuffers[i].buffer);
740
741                                 for (int j = 0; j < tensor_info.size; j++) {
742                                         new_buf[j] = static_cast<float>(ori_buf[j]);
743                                 }
744
745                         // Replace the original buffer with the new one and release the original.
746                                 mOutputTensorBuffers[i].buffer = new_buf;
747
748                                 if (!mOutputTensorBuffers[i].owner_is_backend)
749                                         delete[] ori_buf;
750                         }
751
752                         outputData.data.push_back(
753                                         static_cast<void *>(mOutputTensorBuffers[i].buffer));
754                 }
755
756                 return MEDIA_VISION_ERROR_NONE;
757         }
758
759         int Inference::Bind(void)
760         {
761                 LOGI("ENTER");
762
763                 if (mConfig.mBackedType <= MV_INFERENCE_BACKEND_NONE ||
764                         mConfig.mBackedType >= MV_INFERENCE_BACKEND_MAX) {
765                         LOGE("NOT SUPPORTED BACKEND %d", mConfig.mBackedType);
766                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
767                 }
768
769                 auto iter = mSupportedInferenceBackend.find(mConfig.mBackedType);
770                 std::string backendName = (iter->second).first;
771                 LOGI("backend string name: %s", backendName.c_str());
772
773                 inference_engine_config config = {
774                         .backend_name = backendName,
775                         .backend_type = mConfig.mBackedType,
776                         // By default, the target device is CPU. If the user defined a desired
777                         // device type in the json file, it will be set by the Load callback.
778                         .target_devices = mConfig.mTargetTypes,
779                 };
780
781                 // Create a backend class object.
782                 try {
783                         mBackend = new InferenceEngineCommon();
784                 } catch (const std::bad_alloc &ex) {
785                         LOGE("Fail to create backend : %s", ex.what());
786                         return MEDIA_VISION_ERROR_OUT_OF_MEMORY;
787                 }
788
789                 // Bind a backend library.
790                 int ret = mBackend->BindBackend(&config);
791                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
792                         LOGE("Fail to bind backend library.(%d)", mConfig.mBackedType);
793                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
794                 }
795
796                 // Get capacity information from a backend.
797                 ret = mBackend->GetBackendCapacity(&mBackendCapacity);
798                 if (ret != MEDIA_VISION_ERROR_NONE) {
799                         LOGE("Fail to get backend capacity.");
800                         return ret;
801                 }
802
803                 LOGI("LEAVE");
804
805                 return MEDIA_VISION_ERROR_NONE;
806         }
807
808         int Inference::Prepare(void)
809         {
810                 LOGI("ENTER");
811
812                 mCh = mConfig.mTensorInfo.ch;
813                 mDim = mConfig.mTensorInfo.dim;
814                 mInputSize =
815                                 cv::Size(mConfig.mTensorInfo.width, mConfig.mTensorInfo.height);
816                 LOGI("InputSize is %d x %d\n", mInputSize.width, mInputSize.height);
817
818                 mDeviation = mConfig.mStdValue;
819                 mMean = mConfig.mMeanValue;
820                 LOGI("mean %.4f, deviation %.4f", mMean, mDeviation);
821
822                 mOutputNumbers = mConfig.mMaxOutputNumbers;
823                 LOGI("outputNumber %d", mOutputNumbers);
824
825                 mThreshold = mConfig.mConfidenceThresHold;
826                 LOGI("threshold %.4f", mThreshold);
827
828                 // Check if backend supports a given target device/devices or not.
829                 if (mConfig.mTargetTypes & MV_INFERENCE_TARGET_DEVICE_CPU) {
830                         if (!(mBackendCapacity.supported_accel_devices &
831                                   INFERENCE_TARGET_CPU)) {
832                                 LOGE("Backend doesn't support CPU device as an accelerator.");
833                                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
834                         }
835                 }
836
837                 if (mConfig.mTargetTypes & MV_INFERENCE_TARGET_DEVICE_GPU) {
838                         if (!(mBackendCapacity.supported_accel_devices &
839                                   INFERENCE_TARGET_GPU)) {
840                                 LOGE("Backend doesn't support GPU device as an accelerator.");
841                                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
842                         }
843                 }
844
845                 if (mConfig.mTargetTypes & MV_INFERENCE_TARGET_DEVICE_CUSTOM) {
846                         if (!(mBackendCapacity.supported_accel_devices &
847                                   INFERENCE_TARGET_CUSTOM)) {
848                                 LOGE("Backend doesn't support custom device as an accelerator.");
849                                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
850                         }
851                 }
852
853                 mBackend->SetTargetDevices(ConvertTargetTypes(mConfig.mTargetTypes));
854
855                 LOGI("LEAVE");
856
857                 return MEDIA_VISION_ERROR_NONE;
858         }
859
860         int Inference::Load(void)
861         {
862                 LOGI("ENTER");
863
864                 std::string label_file = mConfig.mUserFilePath;
865                 size_t userFileLength = label_file.length();
866                 if (userFileLength > 0 && access(label_file.c_str(), F_OK)) {
867                         LOGE("Label file does not exist at [%s].", label_file.c_str());
868                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
869                 }
870
871                 int ret = (userFileLength > 0) ? SetUserFile(label_file) :
872                                                                                  MEDIA_VISION_ERROR_NONE;
873                 if (ret != MEDIA_VISION_ERROR_NONE) {
874                         LOGE("Fail to load label file.");
875                         return ret;
876                 }
877
878                 // Check if model file is valid or not.
879                 std::string ext_str = mConfig.mWeightFilePath.substr(
880                                 mConfig.mWeightFilePath.find_last_of(".") + 1);
881                 std::map<std::string, int>::iterator key = mModelFormats.find(ext_str);
882                 if (key == mModelFormats.end()) {
883                         LOGE("Invalid model file format.(ext = %s)", ext_str.c_str());
884                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
885                 }
886
887                 LOGI("%s model file has been detected.", ext_str.c_str());
888
889                 std::vector<std::string> models;
890
891                 inference_model_format_e model_format =
892                                 static_cast<inference_model_format_e>(key->second);
893
894                 // Push model file information to models vector properly according to detected model format.
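                // The weight file always comes first; formats with a separate topology or
                // config file append it as the second entry.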
895                 switch (model_format) {
896                 case INFERENCE_MODEL_CAFFE:
897                 case INFERENCE_MODEL_TF:
898                 case INFERENCE_MODEL_DARKNET:
899                 case INFERENCE_MODEL_DLDT:
900                 case INFERENCE_MODEL_ONNX:
901                 case INFERENCE_MODEL_VIVANTE:
902                         models.push_back(mConfig.mWeightFilePath);
903                         models.push_back(mConfig.mConfigFilePath);
904                         break;
905                 case INFERENCE_MODEL_TFLITE:
906                 case INFERENCE_MODEL_TORCH:
907                         models.push_back(mConfig.mWeightFilePath);
908                         break;
909                 default:
910                         break;
911                 }
912
913                 // Request model loading to backend engine.
914                 ret = mBackend->Load(models, model_format);
915                 if (ret != INFERENCE_ENGINE_ERROR_NONE) {
916                         delete mBackend; mBackend = NULL; // avoid a second delete in the destructor
917                         LOGE("Fail to load model");
918                         mCanRun = false;
919                         std::vector<std::string>().swap(models);
920                         return ConvertEngineErrorToVisionError(ret);
921                 }
922
923                 std::vector<std::string>().swap(models);
924
925                 // Prepare input and output tensor buffers.
926                 PrepareTenosrBuffers();
927
928                 mCanRun = true;
929
930                 LOGI("LEAVE");
931
932                 return ConvertEngineErrorToVisionError(ret);
933         }
934
935         int Inference::Run(std::vector<mv_source_h> &mvSources,
936                                            std::vector<mv_rectangle_s> &rects)
937         {
938                 int ret = INFERENCE_ENGINE_ERROR_NONE;
939
940                 if (!mCanRun) {
941                         LOGE("Invalid to run inference");
942                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
943                 }
944
945                 /* convert mv_source to cv::Mat */
946                 cv::Mat cvSource;
947                 cv::Rect cvRoi;
948                 unsigned int width = 0, height = 0;
949                 unsigned int bufferSize = 0;
950                 unsigned char *buffer = NULL;
951
952                 if (mvSources.empty()) {
953                         LOGE("mvSources is empty. One mv source is required.");
954                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
955                 }
956
957                 // Only one input source can be requested for inference as of now.
958                 if (mvSources.size() > 1) {
959                         LOGE("It allows only one mv source for the inference.");
960                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
961                 }
962
963                 // TODO. Consider multiple sources.
964                 mv_source_h mvSource = mvSources.front();
965                 mv_rectangle_s *roi = rects.empty() ? NULL : &(rects.front());
966
967                 mv_colorspace_e colorspace = MEDIA_VISION_COLORSPACE_INVALID;
968
969                 if (mv_source_get_width(mvSource, &width) != MEDIA_VISION_ERROR_NONE ||
970                         mv_source_get_height(mvSource, &height) !=
971                                         MEDIA_VISION_ERROR_NONE ||
972                         mv_source_get_colorspace(mvSource, &colorspace) !=
973                                         MEDIA_VISION_ERROR_NONE ||
974                         mv_source_get_buffer(mvSource, &buffer, &bufferSize))
975                         return MEDIA_VISION_ERROR_INTERNAL;
976
977                 // TODO. Let's support various color spaces.
978
979                 if (colorspace != MEDIA_VISION_COLORSPACE_RGB888) {
980                         LOGE("Not Supported format!\n");
981                         return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
982                 }
983
984                 if (roi == NULL) {
985                         cvSource = cv::Mat(cv::Size(width, height), CV_MAKETYPE(CV_8U, 3),
986                                                            buffer)
987                                                            .clone();
988                 } else {
989                         cvRoi.x = roi->point.x;
990                         cvRoi.y = roi->point.y;
991                         cvRoi.width = (roi->point.x + roi->width) >= width ?
992                                                                   width - roi->point.x :
993                                                                   roi->width;
994                         cvRoi.height = (roi->point.y + roi->height) >= height ?
995                                                                    height - roi->point.y :
996                                                                    roi->height;
997                         cvSource = cv::Mat(cv::Size(width, height), CV_MAKETYPE(CV_8U, 3),
998                                                            buffer)(cvRoi)
999                                                            .clone();
1000                 }
1001
1002                 LOGE("Size: w:%u, h:%u", cvSource.size().width, cvSource.size().height);
1003
1004                 if (mCh != 1 && mCh != 3) {
1005                         LOGE("Channel not supported.");
1006                         return MEDIA_VISION_ERROR_INVALID_PARAMETER;
1007                 }
1008
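                // Fill each input tensor: Preprocess() writes the normalized image directly
                // into the backend's input tensor buffer.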
1009                 std::vector<inference_engine_tensor_buffer>::iterator iter;
1010                 for (iter = mInputTensorBuffers.begin();
1011                          iter != mInputTensorBuffers.end(); iter++) {
1012                         inference_engine_tensor_buffer tensor_buffer = *iter;
1013
1014                         int data_type = ConvertToCv(tensor_buffer.data_type);
1015
1016                         // Convert color space of input tensor data and then normalize it.
1017                         ret = Preprocess(cvSource,
1018                                                          cv::Mat(mInputSize.height, mInputSize.width,
1019                                                                          data_type, tensor_buffer.buffer),
1020                                                          data_type);
1021                         if (ret != MEDIA_VISION_ERROR_NONE) {
1022                                 LOGE("Fail to preprocess input tensor data.");
1023                                 return ret;
1024                         }
1025                 }
1026
1027                 ret = mBackend->Run(mInputTensorBuffers, mOutputTensorBuffers);
1028
1029                 return ConvertEngineErrorToVisionError(ret);
1030         }
1031
1032         std::pair<std::string, bool>
1033         Inference::GetSupportedInferenceBackend(int backend)
1034         {
1035                 return mSupportedInferenceBackend[backend];
1036         }
1037
1038         int Inference::GetClassficationResults(
1039                         ImageClassificationResults *classificationResults)
1040         {
1041                 tensor_t outputData;
1042
1043                 // Get inference result and contain it to outputData.
1044                 int ret = FillOutputResult(outputData);
1045                 if (ret != MEDIA_VISION_ERROR_NONE) {
1046                         LOGE("Fail to get output result.");
1047                         return ret;
1048                 }
1049
1050                 // Will contain top N results in ascending order.
1051                 std::vector<std::pair<float, int> > top_results;
1052                 std::priority_queue<std::pair<float, int>,
1053                                                         std::vector<std::pair<float, int> >,
1054                                                         std::greater<std::pair<float, int> > >
1055                                 top_result_pq;
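                // top_result_pq is a min-heap; once it holds more than mOutputNumbers
                // entries, the entry with the smallest score is popped.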
1056                 float value = 0.0f;
1057
1058                 std::vector<std::vector<int> > inferDimInfo(outputData.dimInfo);
1059                 std::vector<void *> inferResults(outputData.data.begin(),
1060                                                                                  outputData.data.end());
1061
1062                 int count = inferDimInfo[0][1];
1063                 LOGI("count: %d", count);
1064
1065                 float *prediction = reinterpret_cast<float *>(inferResults[0]);
1066                 for (int i = 0; i < count; ++i) {
1067                         value = prediction[i];
1068
1069                         // Push every score; entries that fall out of the top N are popped
1070                         // below, and the confidence threshold is applied afterwards.
1071                         top_result_pq.push(std::pair<float, int>(value, i));
1072
1073                         // If at capacity, kick the smallest value out.
1074                         if (top_result_pq.size() > mOutputNumbers) {
1075                                 top_result_pq.pop();
1076                         }
1077                 }
1078
1079                 // Copy to output vector and reverse into descending order.
1080                 while (!top_result_pq.empty()) {
1081                         top_results.push_back(top_result_pq.top());
1082                         top_result_pq.pop();
1083                 }
1084                 std::reverse(top_results.begin(), top_results.end());
1085
1086                 int classIdx = -1;
1087                 ImageClassificationResults results;
1088                 results.number_of_classes = 0;
1089                 for (int idx = 0; idx < top_results.size(); ++idx) {
1090                         if (top_results[idx].first < mThreshold)
1091                                 continue;
1092                         LOGI("idx:%d", idx);
1093                         LOGI("classIdx: %d", top_results[idx].second);
1094                         LOGI("classProb: %f", top_results[idx].first);
1095
1096                         classIdx = top_results[idx].second;
1097                         results.indices.push_back(classIdx);
1098                         results.confidences.push_back(top_results[idx].first);
1099                         results.names.push_back(mUserListName[classIdx]);
1100                         results.number_of_classes++;
1101                 }
1102
1103                 *classificationResults = results;
1104                 LOGE("Inference: GetClassificationResults: %d\n",
1105                          results.number_of_classes);
1106                 return MEDIA_VISION_ERROR_NONE;
1107         }
1108
1109         int Inference::GetObjectDetectionResults(
1110                         ObjectDetectionResults *detectionResults)
1111         {
1112                 tensor_t outputData;
1113
1114                 // Get inference result and contain it to outputData.
1115                 int ret = FillOutputResult(outputData);
1116                 if (ret != MEDIA_VISION_ERROR_NONE) {
1117                         LOGE("Fail to get output result.");
1118                         return ret;
1119                 }
1120
1121                 // In case of object detection,
1122                 // a model may apply post-process but others may not.
1123                 // Thus, those cases should be hanlded separately.
1124                 std::vector<std::vector<int> > inferDimInfo(outputData.dimInfo);
1125                 LOGI("inferDimInfo size: %zu", outputData.dimInfo.size());
1126
1127                 std::vector<void *> inferResults(outputData.data.begin(),
1128                                                                                  outputData.data.end());
1129                 LOGI("inferResults size: %zu", inferResults.size());
1130
1131                 float *boxes = nullptr;
1132                 float *classes = nullptr;
1133                 float *scores = nullptr;
1134                 int number_of_detections = 0;
1135
1136                 cv::Mat cvScores, cvClasses, cvBoxes;
1137                 if (outputData.dimInfo.size() == 1) {
1138                         // A single output tensor gives no separate field for the number of
1139                         // detections, so the backend has to provide it. For example, the
1140                         // OpenCV MobilenetV1-SSD output has shape 1x1xNx7, where the 1st of
1141                         // the 7 values is the image id. Batch mode isn't supported, so that
1142                         // field is unused and is repurposed to carry the number of detections,
1143                         // which is why it is read from outputData.data[0] below.
1144
1145                         number_of_detections = static_cast<int>(
1146                                         *reinterpret_cast<float *>(outputData.data[0]));
1147                         cv::Mat cvOutputData(number_of_detections, inferDimInfo[0][3],
1148                                                                  CV_32F, outputData.data[0]);
1149
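                             // Each of the N rows holds 7 values: [image_id, class_id, score, left, top, right, bottom].
                             // Pull out the box columns and rearrange them into [top, left, bottom, right] rows so they
                             // match the layout used by the multi-tensor branch below.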
1150                         // boxes
1151                         cv::Mat cvLeft = cvOutputData.col(3).clone();
1152                         cv::Mat cvTop = cvOutputData.col(4).clone();
1153                         cv::Mat cvRight = cvOutputData.col(5).clone();
1154                         cv::Mat cvBottom = cvOutputData.col(6).clone();
1155
1156                         cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
1157                         cv::hconcat(cvBoxElems, 4, cvBoxes);
1158
1159                         // classes
1160                         cvClasses = cvOutputData.col(1).clone();
1161
1162                         // scores
1163                         cvScores = cvOutputData.col(2).clone();
1164
1165                         boxes = cvBoxes.ptr<float>(0);
1166                         classes = cvClasses.ptr<float>(0);
1167                         scores = cvScores.ptr<float>(0);
1168
1169                 } else {
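                             // With multiple output tensors, the order assumed here is: boxes, classes, scores
                             // and, last, the number of detections (the layout typically produced by detection
                             // models with a built-in post-processing stage).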
1170                         boxes = reinterpret_cast<float *>(inferResults[0]);
1171                         classes = reinterpret_cast<float *>(inferResults[1]);
1172                         scores = reinterpret_cast<float *>(inferResults[2]);
1173                         number_of_detections =
1174                                         (int) (*reinterpret_cast<float *>(inferResults[3]));
1175                 }
1176
1177                 LOGI("number_of_detections = %d", number_of_detections);
1178
1179                 int left, top, right, bottom;
1180                 cv::Rect loc;
1181
1182                 ObjectDetectionResults results;
1183                 results.number_of_objects = 0;
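                     // Each detection comes as a normalized [top, left, bottom, right] box; scale it to the
                     // source image size and drop detections whose score is below mThreshold.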
1184                 for (int idx = 0; idx < number_of_detections; ++idx) {
1185                         if (scores[idx] < mThreshold)
1186                                 continue;
1187
1188                         left = static_cast<int>(boxes[idx * 4 + 1] * mSourceSize.width);
1189                         top = static_cast<int>(boxes[idx * 4 + 0] * mSourceSize.height);
1190                         right = static_cast<int>(boxes[idx * 4 + 3] * mSourceSize.width);
1191                         bottom = static_cast<int>(boxes[idx * 4 + 2] * mSourceSize.height);
1192
1193                         loc.x = left;
1194                         loc.y = top;
1195                         loc.width = right - left + 1;
1196                         loc.height = bottom - top + 1;
1197
1198                         results.indices.push_back(static_cast<int>(classes[idx]));
1199                         results.confidences.push_back(scores[idx]);
1200                         results.names.push_back(
1201                                         mUserListName[static_cast<int>(classes[idx])]);
1202                         results.locations.push_back(loc);
1203                         results.number_of_objects++;
1204
1205                         LOGI("objectClass: %d", static_cast<int>(classes[idx]));
1206                         LOGI("confidence:%f", scores[idx]);
1207                         LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right,
1208                                  bottom);
1209                 }
1210
1211                 *detectionResults = results;
1212                 LOGI("Inference: GetObjectDetectionResults: %d\n",
1213                          results.number_of_objects);
1214                 return MEDIA_VISION_ERROR_NONE;
1215         }
1216
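             // Convert the raw output of the last inference run into FaceDetectionResults.
             // The output parsing mirrors GetObjectDetectionResults(), but only confidences
             // and box locations are reported for faces.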
1217         int
1218         Inference::GetFaceDetectionResults(FaceDetectionResults *detectionResults)
1219         {
1220                 tensor_t outputData;
1221
1222                 // Get the inference result and store it in outputData.
1223                 int ret = FillOutputResult(outputData);
1224                 if (ret != MEDIA_VISION_ERROR_NONE) {
1225                         LOGE("Fail to get output result.");
1226                         return ret;
1227                 }
1228
1229                 // In the case of face detection,
1230                 // some models apply post-processing while others do not.
1231                 // Thus, those cases have to be handled separately.
1232                 std::vector<std::vector<int> > inferDimInfo(outputData.dimInfo);
1233                 LOGI("inferDimInfo size: %zu", outputData.dimInfo.size());
1234
1235                 std::vector<void *> inferResults(outputData.data.begin(),
1236                                                                                  outputData.data.end());
1237                 LOGI("inferResults size: %zu", inferResults.size());
1238
1239                 float *boxes = nullptr;
1240                 float *classes = nullptr;
1241                 float *scores = nullptr;
1242                 int number_of_detections = 0;
1243
1244                 cv::Mat cvScores, cvClasses, cvBoxes;
1245                 if (outputData.dimInfo.size() == 1) {
1246                         // There is no way to know how many objects were detected unless that number is
1247                         // provided explicitly, so in this case the backend has to supply it itself.
1248                         // For example, with OpenCV, MobilenetV1-SSD does not provide it, so the number of
1249                         // objects is written to the 1st element of the output buffer (outputData.data[0]).
1250                         // The shape is 1x1xNx7 and the 1st of the 7 values indicates the image id, which is
1251                         // useless when batch mode is not supported, so that slot is reused to carry the count.
1252
1253                         number_of_detections = static_cast<int>(
1254                                         *reinterpret_cast<float *>(outputData.data[0]));
1255                         cv::Mat cvOutputData(number_of_detections, inferDimInfo[0][3],
1256                                                                  CV_32F, outputData.data[0]);
1257
1258                         // boxes
1259                         cv::Mat cvLeft = cvOutputData.col(3).clone();
1260                         cv::Mat cvTop = cvOutputData.col(4).clone();
1261                         cv::Mat cvRight = cvOutputData.col(5).clone();
1262                         cv::Mat cvBottom = cvOutputData.col(6).clone();
1263
1264                         cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
1265                         cv::hconcat(cvBoxElems, 4, cvBoxes);
1266
1267                         // classes
1268                         cvClasses = cvOutputData.col(1).clone();
1269
1270                         // scores
1271                         cvScores = cvOutputData.col(2).clone();
1272
1273                         boxes = cvBoxes.ptr<float>(0);
1274                         classes = cvClasses.ptr<float>(0);
1275                         scores = cvScores.ptr<float>(0);
1276
1277                 } else {
1278                         boxes = reinterpret_cast<float *>(inferResults[0]);
1279                         classes = reinterpret_cast<float *>(inferResults[1]);
1280                         scores = reinterpret_cast<float *>(inferResults[2]);
1281                         number_of_detections = static_cast<int>(
1282                                         *reinterpret_cast<float *>(inferResults[3]));
1283                 }
1284
1285                 int left, top, right, bottom;
1286                 cv::Rect loc;
1287
1288                 FaceDetectionResults results;
1289                 results.number_of_faces = 0;
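                     // Same box layout as in GetObjectDetectionResults(): normalized [top, left, bottom, right]
                     // values scaled back to the source image size, filtered by mThreshold.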
1290                 for (int idx = 0; idx < number_of_detections; ++idx) {
1291                         if (scores[idx] < mThreshold)
1292                                 continue;
1293
1294                         left = static_cast<int>(boxes[idx * 4 + 1] * mSourceSize.width);
1295                         top = static_cast<int>(boxes[idx * 4 + 0] * mSourceSize.height);
1296                         right = static_cast<int>(boxes[idx * 4 + 3] * mSourceSize.width);
1297                         bottom = static_cast<int>(boxes[idx * 4 + 2] * mSourceSize.height);
1298
1299                         loc.x = left;
1300                         loc.y = top;
1301                         loc.width = right - left + 1;
1302                         loc.height = bottom - top + 1;
1303
1304                         results.confidences.push_back(scores[idx]);
1305                         results.locations.push_back(loc);
1306                         results.number_of_faces++;
1307
1308                         LOGI("confidence:%f", scores[idx]);
1309                         LOGI("class: %f", classes[idx]);
1310                         LOGI("left:%f, top:%f, right:%f, bottom:%f", boxes[idx * 4 + 1],
1311                                  boxes[idx * 4 + 0], boxes[idx * 4 + 3], boxes[idx * 4 + 2]);
1312                         LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right,
1313                                  bottom);
1314                 }
1315
1316                 *detectionResults = results;
1317                 LOGI("Inference: GetFaceDetectionResults: %d\n",
1318                          results.number_of_faces);
1319                 return MEDIA_VISION_ERROR_NONE;
1320         }
1321
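             // Convert the raw output of the last inference run into FacialLandMarkDetectionResults.
             // The output tensor is read as a flat list of normalized landmark coordinates.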
1322         int Inference::GetFacialLandMarkDetectionResults(
1323                         FacialLandMarkDetectionResults *detectionResults)
1324         {
1325                 tensor_t outputData;
1326
1327                 // Get the inference result and store it in outputData.
1328                 int ret = FillOutputResult(outputData);
1329                 if (ret != MEDIA_VISION_ERROR_NONE) {
1330                         LOGE("Fail to get output result.");
1331                         return ret;
1332                 }
1333
1334                 std::vector<std::vector<int> > inferDimInfo(outputData.dimInfo);
1335                 std::vector<void *> inferResults(outputData.data.begin(),
1336                                                                                  outputData.data.end());
1337
1338                 long number_of_detections = inferDimInfo[0][1];
1339                 float *loc = reinterpret_cast<float *>(inferResults[0]);
1340
1341                 FacialLandMarkDetectionResults results;
1342                 results.number_of_landmarks = 0;
1343
1344                 cv::Point point(0, 0);
1346                 LOGI("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height);
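                     // loc holds interleaved normalized (x, y) pairs, two values per landmark;
                     // scale each pair to the source image size.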
1347                 for (int idx = 0; idx < number_of_detections; idx += 2) {
1348                         point.x = static_cast<int>(loc[idx] * mSourceSize.width);
1349                         point.y = static_cast<int>(loc[idx + 1] * mSourceSize.height);
1350
1351                         results.locations.push_back(point);
1352                         results.number_of_landmarks++;
1353
1354                         LOGI("x:%d, y:%d", point.x, point.y);
1355                 }
1356
1357                 *detectionResults = results;
1358                 LOGI("Inference: FacialLandmarkDetectionResults: %d\n",
1359                          results.number_of_landmarks);
1360                 return MEDIA_VISION_ERROR_NONE;
1361         }
1362
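             // Convert the raw output of the last inference run into PoseEstimationResults by
             // locating the peak of each per-keypoint heat map and mapping it back to
             // source-image coordinates.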
1363         int Inference::GetPoseEstimationDetectionResults(
1364                         PoseEstimationResults *detectionResults)
1365         {
1366                 tensor_t outputData;
1367
1368                 // Get the inference result and store it in outputData.
1369                 int ret = FillOutputResult(outputData);
1370                 if (ret != MEDIA_VISION_ERROR_NONE) {
1371                         LOGE("Fail to get output result.");
1372                         return ret;
1373                 }
1374
1375                 std::vector<std::vector<int> > inferDimInfo(outputData.dimInfo);
1376                 std::vector<void *> inferResults(outputData.data.begin(),
1377                                                                                  outputData.data.end());
1378
1379                 long number_of_pose = inferDimInfo[0][3];
1380                 float *tmp = static_cast<float *>(inferResults[0]);
1381                 cv::Size heatMapSize(inferDimInfo[0][1], inferDimInfo[0][2]);
1382
1383                 cv::Point loc;
1384                 double score;
1385                 cv::Mat blurredHeatMap;
1386
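                     // Interpret the flat output buffer as a heat-map image with one channel per
                     // keypoint, then split it into individual single-channel heat maps.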
1387                 cv::Mat reShapeTest(cv::Size(inferDimInfo[0][2], inferDimInfo[0][1]),
1388                                                         CV_32FC(inferDimInfo[0][3]), (void *) tmp);
1389
1390                 std::vector<cv::Mat> multiChannels;
1391                 cv::split(reShapeTest, multiChannels);
1392
1393                 float ratioX = static_cast<float>(mSourceSize.width) /
1394                                            static_cast<float>(inferDimInfo[0][2]);
1395                 float ratioY = static_cast<float>(mSourceSize.height) /
1396                                            static_cast<float>(inferDimInfo[0][1]);
1397
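                     // For each keypoint, take the location of the strongest heat-map response and map
                     // it back to source-image coordinates using the width/height ratios above.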
1398                 PoseEstimationResults results;
1399                 results.number_of_pose_estimation = 0;
1400                 for (int poseIdx = 0; poseIdx < number_of_pose; poseIdx++) {
1401                         cv::Mat heatMap = multiChannels[poseIdx];
1402
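                             // Note: a Gaussian-blurred copy of the heat map is computed here, but the peak
                             // search below still runs on the raw heat map.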
1403                         cv::GaussianBlur(heatMap, blurredHeatMap, cv::Size(), 5.0, 5.0);
1404                         cv::minMaxLoc(heatMap, NULL, &score, NULL, &loc);
1405
1406                         LOGI("PoseIdx[%2d]: x[%2d], y[%2d], score[%.3f]", poseIdx, loc.x,
1407                                  loc.y, score);
1408                         LOGI("PoseIdx[%2d]: x[%2d], y[%2d], score[%.3f]", poseIdx,
1409                                  static_cast<int>(static_cast<float>(loc.x + 1) * ratioX),
1410                                  static_cast<int>(static_cast<float>(loc.y + 1) * ratioY),
1411                                  score);
1412
1413                         loc.x = static_cast<int>(static_cast<float>(loc.x + 1) * ratioX);
1414                         loc.y = static_cast<int>(static_cast<float>(loc.y + 1) * ratioY);
1415                         results.locations.push_back(loc);
1416                         results.number_of_pose_estimation++;
1417                 }
1418
1419                 *detectionResults = results;
1420                 LOGI("Inference: PoseEstimationResults: %d\n",
1421                          results.number_of_pose_estimation);
1422                 return MEDIA_VISION_ERROR_NONE;
1423         }
1424
1425 } /* Inference */
1426 } /* MediaVision */