mv_machine_learning: code refactoring to GetPoseLandmarkDetectionResults function 29/264729/5
authorInki Dae <inki.dae@samsung.com>
Mon, 27 Sep 2021 09:27:31 +0000 (18:27 +0900)
committerInki Dae <inki.dae@samsung.com>
Wed, 29 Sep 2021 06:00:26 +0000 (15:00 +0900)
Did code refactoring to GetPoseLandmarkDetectionResults function of
Inference class by dropping Inference class dependency to mPoseResult object.

What this patch does is,
 - use a smart pointer instead of the new operator, and make the poseResult
   object valid only within the mv_inference_pose_landmark_detect_open
   function context.
 - drop some unnecessary local variables.
 - and code sliding.

The biggest change in this patch is dropping the Inference class's dependency
on the mPoseResult object, in preparation for the next code refactoring. As a
result, it removes one brace level from GetPoseLandmarkDetectionResults and
simplifies the function, improving code readability.

Change-Id: I25d1a175571d407404c088713a14e485496b2920
Signed-off-by: Inki Dae <inki.dae@samsung.com>
include/mv_inference_private.h
mv_machine_learning/mv_inference/inference/include/Inference.h
mv_machine_learning/mv_inference/inference/include/PoseDecoder.h
mv_machine_learning/mv_inference/inference/src/Inference.cpp
mv_machine_learning/mv_inference/inference/src/mv_inference_open.cpp
packaging/capi-media-vision.spec

index 0d8a35e..7bcd315 100644 (file)
@@ -19,6 +19,9 @@
 
 #include <mv_common.h>
 
+#define MAX_NUMBER_OF_POSE     5
+#define MAX_NUMBER_OF_LANDMARKS_PER_POSE       500
+
 #ifdef __cplusplus
 extern "C" {
 #endif /* __cplusplus */
@@ -58,7 +61,7 @@ typedef struct mv_inference_landmark_s{
 typedef struct {
     int number_of_poses;
     int number_of_landmarks_per_pose;            /**< The number of landmarks*/
-    mv_inference_landmark_s **landmarks; /**< A set of landmarks describing pose */
+    mv_inference_landmark_s landmarks[MAX_NUMBER_OF_POSE][MAX_NUMBER_OF_LANDMARKS_PER_POSE]; /**< A set of landmarks describing pose */
 } mv_inference_pose_s;
 
 /**
index 1b65421..969b5a2 100644 (file)
@@ -326,7 +326,7 @@ namespace inference
                 * @since_tizen 6.0
                 * @return @c true on success, otherwise a negative error value
                 */
-               int GetPoseLandmarkDetectionResults(mv_inference_pose_result_h *detectionResults,
+               int GetPoseLandmarkDetectionResults(std::unique_ptr<mv_inference_pose_s> &detectionResults,
                                                                                int width, int height);
 
                mv_engine_config_h GetEngineConfig(void)
@@ -364,8 +364,6 @@ namespace inference
                TensorBuffer mOutputTensorBuffers;
                inference_engine_layer_property mOutputLayerProperty;
 
-               mv_inference_pose_s *mPoseResult;
-
                Metadata mMetadata;
                PreProcess mPreProc;
                PostProcess mPostProc;
index 11289ac..4960d68 100644 (file)
@@ -26,8 +26,6 @@
 #include "OutputMetadata.h"
 #include "Landmark.h"
 
-#define MAX_NUMBER_OF_POSE 5
-
 /**
  * @file PoseDecoder.h
  * @brief This file contains the PoseDecoder class definition which
index 7194736..f5d5488 100755 (executable)
@@ -76,7 +76,6 @@ namespace inference
                        mSourceSize(cv::Size()),
                        engine_config(),
                        mBackend(),
-                       mPoseResult(NULL),
                        mMetadata(),
                        mPreProc(),
                        mPostProc()
@@ -136,14 +135,6 @@ namespace inference
                                        mOutputLayerProperty.layers);
                }
 
-               if (mPoseResult) {
-                       for (int poseIndex = 0; poseIndex < mPoseResult->number_of_poses; ++poseIndex) {
-                               delete [] mPoseResult->landmarks[poseIndex];
-                       }
-                       delete [] mPoseResult->landmarks;
-                       delete mPoseResult;
-               }
-
                mModelFormats.clear();
 
                // Release backend engine.
@@ -1652,13 +1643,17 @@ namespace inference
        }
 
        int Inference::GetPoseLandmarkDetectionResults(
-                       mv_inference_pose_result_h *detectionResults, int width, int height)
+                       std::unique_ptr<mv_inference_pose_s> &detectionResults, int width, int height)
        {
                LOGI("ENTER");
+
                OutputMetadata& outputMeta = mMetadata.GetOutputMeta();
+               auto poseResult = std::make_unique<mv_inference_pose_s>();
+
                if (outputMeta.IsParsed()) {
                        auto& landmarkInfo = outputMeta.GetLandmark();
                        auto& scoreInfo = outputMeta.GetScore();
+
                        if (!mOutputTensorBuffers.exist(landmarkInfo.GetName()) ||
                                !mOutputTensorBuffers.exist(scoreInfo.GetName())) {
                                LOGE("output buffers named of %s or %s are NULL",
@@ -1669,6 +1664,7 @@ namespace inference
                        int heatMapWidth = 0;
                        int heatMapHeight = 0;
                        int heatMapChannel = 0;
+
                        if (landmarkInfo.GetDecodingType() != INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
                                heatMapWidth = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.wIdx];
                                heatMapHeight = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.hIdx];
@@ -1677,42 +1673,31 @@ namespace inference
 
                        LOGI("heatMap: w[%d], h[%d], c[%d]", heatMapWidth, heatMapHeight, heatMapChannel);
 
-                       if (mPoseResult == NULL) {
-                               mPoseResult = new(std::nothrow) mv_inference_pose_s;
-                               if (mPoseResult == NULL) {
-                                       LOGE("Fail to create result handle");
-                                       return MEDIA_VISION_ERROR_INTERNAL;
-                               }
-                               // 2d+single or 2d+multi or 3d+single or 3d+multi
-                               int defaultNumberOfPose = (landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
-                                                                                  landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_3D_SINGLE) ? 1 : MAX_NUMBER_OF_POSE;
-                               std::vector<int> channelIndexes = landmarkInfo.GetDimInfo().GetValidIndexAll();
-
-                               // If INFERENCE_LANDMARK_DECODING_TYPE_BYPASS,
-                               // the landmarkChannel is guessed from the shape of the landmark output tensor.
-                               // Otherwise, it is guessed from the heatMapChannel.
-                               int landmarkChannel = 0;
-                               if (landmarkInfo.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
-                                       landmarkChannel = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[channelIndexes[0]]
-                                                                               / landmarkInfo.GetOffset();
-                               } else {
-                                       landmarkChannel = heatMapChannel;
-                               }
+                       std::vector<int> channelIndexes = landmarkInfo.GetDimInfo().GetValidIndexAll();
 
-                               mPoseResult->number_of_landmarks_per_pose = mUserListName.empty() ? landmarkChannel :
-                                                                                                                       static_cast<int>(mUserListName.size());
+                       // If INFERENCE_LANDMARK_DECODING_TYPE_BYPASS,
+                       // the landmarkChannel is guessed from the shape of the landmark output tensor.
+                       // Otherwise, it is guessed from the heatMapChannel. (heatMapChannel is used in default).
+                       int landmarkChannel = heatMapChannel;
 
-                               LOGE("number of landmarks per pose: %d", mPoseResult->number_of_landmarks_per_pose );
-                               mPoseResult->landmarks = new mv_inference_landmark_s* [defaultNumberOfPose];
-                               for (int idx = 0; idx < defaultNumberOfPose; ++idx) {
-                                       mPoseResult->landmarks[idx] = new mv_inference_landmark_s [mPoseResult->number_of_landmarks_per_pose];
-                               }
+                       if (landmarkInfo.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS)
+                               landmarkChannel = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[channelIndexes[0]] / landmarkInfo.GetOffset();
+
+                       poseResult->number_of_landmarks_per_pose = mUserListName.empty() ? landmarkChannel :
+                                                                                                               static_cast<int>(mUserListName.size());
+
+                       LOGE("number of landmarks per pose: %d", poseResult->number_of_landmarks_per_pose );
+
+                       if (poseResult->number_of_landmarks_per_pose >= MAX_NUMBER_OF_LANDMARKS_PER_POSE) {
+                               LOGE("Exceeded maxinum number of landmarks per pose(%d >= %d).",
+                                       poseResult->number_of_landmarks_per_pose, MAX_NUMBER_OF_LANDMARKS_PER_POSE);
+                               return MEDIA_VISION_ERROR_INVALID_PARAMETER;
                        }
 
                        // decoding
                        PoseDecoder poseDecoder(mOutputTensorBuffers, outputMeta,
                                                                        heatMapWidth, heatMapHeight, heatMapChannel,
-                                                                       mPoseResult->number_of_landmarks_per_pose);
+                                                                       poseResult->number_of_landmarks_per_pose);
 
                        // initialize decorder queue with landmarks to be decoded.
                        int ret = poseDecoder.init();
@@ -1733,9 +1718,9 @@ namespace inference
                        poseDecoder.decode(inputW, inputH, thresRadius);
 
                        int part = 0;
-                       mPoseResult->number_of_poses = poseDecoder.getNumberOfPose();
-                       for (int poseIndex = 0; poseIndex < mPoseResult->number_of_poses; ++poseIndex) {
-                               for (int landmarkIndex = 0; landmarkIndex < mPoseResult->number_of_landmarks_per_pose; ++ landmarkIndex) {
+                       poseResult->number_of_poses = poseDecoder.getNumberOfPose();
+                       for (int poseIndex = 0; poseIndex < poseResult->number_of_poses; ++poseIndex) {
+                               for (int landmarkIndex = 0; landmarkIndex < poseResult->number_of_landmarks_per_pose; ++ landmarkIndex) {
                                        part = landmarkIndex;
                                        if (!mUserListName.empty()) {
                                                part = std::stoi(mUserListName[landmarkIndex]) - 1;
@@ -1744,18 +1729,18 @@ namespace inference
                                                }
                                        }
 
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].point.x =
+                                       poseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
+                                       poseResult->landmarks[poseIndex][landmarkIndex].point.x =
                                                                                poseDecoder.getPointX(poseIndex, part) * static_cast<float>(mSourceSize.width);
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].point.y =
+                                       poseResult->landmarks[poseIndex][landmarkIndex].point.y =
                                                                                poseDecoder.getPointY(poseIndex, part) * static_cast<float>(mSourceSize.height);
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].label = landmarkIndex;
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].score =
+                                       poseResult->landmarks[poseIndex][landmarkIndex].label = landmarkIndex;
+                                       poseResult->landmarks[poseIndex][landmarkIndex].score =
                                                                                poseDecoder.getScore(poseIndex, part);
                                }
                        }
-                       *detectionResults = static_cast<mv_inference_pose_result_h>(mPoseResult);
 
+                       detectionResults = std::move(poseResult);
                } else {
                        tensor_t outputData;
 
@@ -1766,47 +1751,27 @@ namespace inference
                                        return ret;
                        }
 
-                       long number_of_poses = 1;
-                       long number_of_landmarks = outputData.dimInfo[0][3];
-                       float *tmp = static_cast<float *>(outputData.data[0]);
-                       cv::Size heatMapSize(outputData.dimInfo[0][1], outputData.dimInfo[0][2]);
-
-                       cv::Point loc;
-                       cv::Point2f loc2f;
-                       double score;
-                       cv::Mat blurredHeatMap;
-
                        cv::Mat reShapeTest(cv::Size(outputData.dimInfo[0][2], outputData.dimInfo[0][1]),
-                                                                                                       CV_32FC(outputData.dimInfo[0][3]), (void *) tmp);
-
+                                                                                CV_32FC(outputData.dimInfo[0][3]), outputData.data[0]);
                        cv::Mat multiChannels[outputData.dimInfo[0][3]];
+
                        split(reShapeTest, multiChannels);
 
                        float ratioX = static_cast<float>(outputData.dimInfo[0][2]);
                        float ratioY = static_cast<float>(outputData.dimInfo[0][1]);
 
-                       if (mPoseResult == NULL) {
-                               if(!mUserListName.empty()) {
-                                       number_of_landmarks = mUserListName.size();
-                               }
-                               mPoseResult = new mv_inference_pose_s;
-                               if (mPoseResult == NULL) {
-                                       LOGE("Fail to create result handle");
-                                       return MEDIA_VISION_ERROR_INTERNAL;
-                               }
+                       poseResult->number_of_poses = 1;
+                       poseResult->number_of_landmarks_per_pose = outputData.dimInfo[0][3];
 
-                               mPoseResult->number_of_poses= number_of_poses;
-                               mPoseResult->number_of_landmarks_per_pose = number_of_landmarks;
-                               mPoseResult->landmarks = new mv_inference_landmark_s*[number_of_poses];
-                               for (int poseIndex = 0; poseIndex < number_of_poses; ++poseIndex) {
-                                       mPoseResult->landmarks[poseIndex] = new mv_inference_landmark_s[number_of_landmarks];
-                               }
+                       if (poseResult->number_of_landmarks_per_pose >= MAX_NUMBER_OF_LANDMARKS_PER_POSE) {
+                               LOGE("Exeeded maxinum number of landmarks per pose(%d >= %d).",
+                                       poseResult->number_of_landmarks_per_pose, MAX_NUMBER_OF_LANDMARKS_PER_POSE);
+                               return MEDIA_VISION_ERROR_INVALID_PARAMETER;
                        }
 
-                       int part = 0;
-                       for (int poseIndex = 0; poseIndex < number_of_poses; ++poseIndex) {
-                               for (int landmarkIndex = 0; landmarkIndex < number_of_landmarks; landmarkIndex++) {
-                                       part = landmarkIndex;
+                       for (int poseIndex = 0; poseIndex < poseResult->number_of_poses; ++poseIndex) {
+                               for (int landmarkIndex = 0; landmarkIndex < poseResult->number_of_landmarks_per_pose; landmarkIndex++) {
+                                       int part = landmarkIndex;
                                        if (!mUserListName.empty()) {
                                                part = std::stoi(mUserListName[landmarkIndex]) - 1;
                                                if (part < 0) {
@@ -1815,6 +1780,10 @@ namespace inference
                                        }
 
                                        cv::Mat heatMap = multiChannels[part];
+                                       double score;
+                                       cv::Point loc;
+                                       cv::Point2f loc2f;
+                                       cv::Mat blurredHeatMap;
 
                                        cv::GaussianBlur(heatMap, blurredHeatMap, cv::Size(), 5.0, 5.0);
                                        cv::minMaxLoc(heatMap, NULL, &score, NULL, &loc);
@@ -1824,14 +1793,15 @@ namespace inference
                                        LOGI("landmarkIndex[%2d] - mapping to [%2d]: x[%.3f], y[%.3f], score[%.3f]",
                                                        landmarkIndex, part, loc2f.x, loc2f.y, score);
 
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].point.x = static_cast<int>(static_cast<float>(width) * loc2f.x);
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].point.y = static_cast<int>(static_cast<float>(height) * loc2f.y);
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].score = score;
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].label = -1;
+                                       poseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
+                                       poseResult->landmarks[poseIndex][landmarkIndex].point.x = static_cast<int>(static_cast<float>(width) * loc2f.x);
+                                       poseResult->landmarks[poseIndex][landmarkIndex].point.y = static_cast<int>(static_cast<float>(height) * loc2f.y);
+                                       poseResult->landmarks[poseIndex][landmarkIndex].score = score;
+                                       poseResult->landmarks[poseIndex][landmarkIndex].label = -1;
                                }
                        }
-                       *detectionResults = static_cast<mv_inference_pose_result_h>(mPoseResult);
+
+                       detectionResults = std::move(poseResult);
                }
 
                return MEDIA_VISION_ERROR_NONE;
index 466b368..ed09bb0 100644 (file)
@@ -851,27 +851,25 @@ int mv_inference_pose_landmark_detect_open(
                return ret;
        }
 
-       mv_inference_pose_result_h result = NULL;
+       std::unique_ptr<mv_inference_pose_s> pose;
 
        ret = pInfer->GetPoseLandmarkDetectionResults(
-                       &result, width, height);
+                       pose, width, height);
        if (ret != MEDIA_VISION_ERROR_NONE) {
                LOGE("Fail to get inference results");
                return ret;
        }
 
-       mv_inference_pose_s *pose_obj = static_cast<mv_inference_pose_s *>(result);
-
-       for (int pose = 0; pose < pose_obj->number_of_poses; ++pose) {
-               for (int index = 0; index < pose_obj->number_of_landmarks_per_pose; ++index) {
-                       LOGI("PoseIdx[%2d]: x[%d], y[%d], score[%.3f]", index,
-                                                                               pose_obj->landmarks[pose][index].point.x,
-                                                                               pose_obj->landmarks[pose][index].point.y,
-                                                                               pose_obj->landmarks[pose][index].score);
+       for (int pose_index = 0; pose_index < pose->number_of_poses; ++pose_index) {
+               for (int landmark_index = 0; landmark_index < pose->number_of_landmarks_per_pose; ++landmark_index) {
+                       LOGI("PoseIdx[%2d]: x[%d], y[%d], score[%.3f]", landmark_index,
+                                                                               pose->landmarks[pose_index][landmark_index].point.x,
+                                                                               pose->landmarks[pose_index][landmark_index].point.y,
+                                                                               pose->landmarks[pose_index][landmark_index].score);
                }
        }
 
-       detected_cb(source, result, user_data);
+       detected_cb(source, static_cast<mv_inference_pose_result_h>(pose.get()), user_data);
 
        return ret;
 }
index 511e837..ee7e44e 100644 (file)
@@ -1,6 +1,6 @@
 Name:        capi-media-vision
 Summary:     Media Vision library for Tizen Native API
-Version:     0.8.16
+Version:     0.8.17
 Release:     1
 Group:       Multimedia/Framework
 License:     Apache-2.0 and BSD-3-Clause