mv_machine_learning: code refactoring to GetPoseLandmarkDetectionResults function 29/264729/5
authorInki Dae <inki.dae@samsung.com>
Mon, 27 Sep 2021 09:27:31 +0000 (18:27 +0900)
committerInki Dae <inki.dae@samsung.com>
Wed, 29 Sep 2021 06:00:26 +0000 (15:00 +0900)
Did code refactoring to GetPoseLandmarkDetectionResults function of
Inference class by dropping Inference class dependency to mPoseResult object.

What this patch does is,
 - use a smart pointer instead of the new operator, and make the poseResult
   object valid only within the mv_inference_pose_landmark_detect_open
   function context.
 - drop some unnecessary local variables.
 - and code sliding.

The biggest change in this patch is dropping the Inference class's dependency
on the mPoseResult object, in preparation for the next code refactoring. As a
result, it removes one brace level from GetPoseLandmarkDetectionResults and
simplifies the function, improving code readability.

Change-Id: I25d1a175571d407404c088713a14e485496b2920
Signed-off-by: Inki Dae <inki.dae@samsung.com>
include/mv_inference_private.h
mv_machine_learning/mv_inference/inference/include/Inference.h
mv_machine_learning/mv_inference/inference/include/PoseDecoder.h
mv_machine_learning/mv_inference/inference/src/Inference.cpp
mv_machine_learning/mv_inference/inference/src/mv_inference_open.cpp
packaging/capi-media-vision.spec

index 0d8a35e..7bcd315 100644 (file)
@@ -19,6 +19,9 @@
 
 #include <mv_common.h>
 
+#define MAX_NUMBER_OF_POSE     5
+#define MAX_NUMBER_OF_LANDMARKS_PER_POSE       500
+
 #ifdef __cplusplus
 extern "C" {
 #endif /* __cplusplus */
@@ -58,7 +61,7 @@ typedef struct mv_inference_landmark_s{
 typedef struct {
     int number_of_poses;
     int number_of_landmarks_per_pose;            /**< The number of landmarks*/
-    mv_inference_landmark_s **landmarks; /**< A set of landmarks describing pose */
+    mv_inference_landmark_s landmarks[MAX_NUMBER_OF_POSE][MAX_NUMBER_OF_LANDMARKS_PER_POSE]; /**< A set of landmarks describing pose */
 } mv_inference_pose_s;
 
 /**
index 1b65421..969b5a2 100644 (file)
@@ -326,7 +326,7 @@ namespace inference
                 * @since_tizen 6.0
                 * @return @c true on success, otherwise a negative error value
                 */
-               int GetPoseLandmarkDetectionResults(mv_inference_pose_result_h *detectionResults,
+               int GetPoseLandmarkDetectionResults(std::unique_ptr<mv_inference_pose_s> &detectionResults,
                                                                                int width, int height);
 
                mv_engine_config_h GetEngineConfig(void)
@@ -364,8 +364,6 @@ namespace inference
                TensorBuffer mOutputTensorBuffers;
                inference_engine_layer_property mOutputLayerProperty;
 
-               mv_inference_pose_s *mPoseResult;
-
                Metadata mMetadata;
                PreProcess mPreProc;
                PostProcess mPostProc;
index 11289ac..4960d68 100644 (file)
@@ -26,8 +26,6 @@
 #include "OutputMetadata.h"
 #include "Landmark.h"
 
-#define MAX_NUMBER_OF_POSE 5
-
 /**
  * @file PoseDecoder.h
  * @brief This file contains the PoseDecoder class definition which
index 7194736..f5d5488 100755 (executable)
@@ -76,7 +76,6 @@ namespace inference
                        mSourceSize(cv::Size()),
                        engine_config(),
                        mBackend(),
-                       mPoseResult(NULL),
                        mMetadata(),
                        mPreProc(),
                        mPostProc()
@@ -136,14 +135,6 @@ namespace inference
                                        mOutputLayerProperty.layers);
                }
 
-               if (mPoseResult) {
-                       for (int poseIndex = 0; poseIndex < mPoseResult->number_of_poses; ++poseIndex) {
-                               delete [] mPoseResult->landmarks[poseIndex];
-                       }
-                       delete [] mPoseResult->landmarks;
-                       delete mPoseResult;
-               }
-
                mModelFormats.clear();
 
                // Release backend engine.
@@ -1652,13 +1643,17 @@ namespace inference
        }
 
        int Inference::GetPoseLandmarkDetectionResults(
-                       mv_inference_pose_result_h *detectionResults, int width, int height)
+                       std::unique_ptr<mv_inference_pose_s> &detectionResults, int width, int height)
        {
                LOGI("ENTER");
+
                OutputMetadata& outputMeta = mMetadata.GetOutputMeta();
+               auto poseResult = std::make_unique<mv_inference_pose_s>();
+
                if (outputMeta.IsParsed()) {
                        auto& landmarkInfo = outputMeta.GetLandmark();
                        auto& scoreInfo = outputMeta.GetScore();
+
                        if (!mOutputTensorBuffers.exist(landmarkInfo.GetName()) ||
                                !mOutputTensorBuffers.exist(scoreInfo.GetName())) {
                                LOGE("output buffers named of %s or %s are NULL",
@@ -1669,6 +1664,7 @@ namespace inference
                        int heatMapWidth = 0;
                        int heatMapHeight = 0;
                        int heatMapChannel = 0;
+
                        if (landmarkInfo.GetDecodingType() != INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
                                heatMapWidth = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.wIdx];
                                heatMapHeight = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.hIdx];
@@ -1677,42 +1673,31 @@ namespace inference
 
                        LOGI("heatMap: w[%d], h[%d], c[%d]", heatMapWidth, heatMapHeight, heatMapChannel);
 
-                       if (mPoseResult == NULL) {
-                               mPoseResult = new(std::nothrow) mv_inference_pose_s;
-                               if (mPoseResult == NULL) {
-                                       LOGE("Fail to create result handle");
-                                       return MEDIA_VISION_ERROR_INTERNAL;
-                               }
-                               // 2d+single or 2d+multi or 3d+single or 3d+multi
-                               int defaultNumberOfPose = (landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
-                                                                                  landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_3D_SINGLE) ? 1 : MAX_NUMBER_OF_POSE;
-                               std::vector<int> channelIndexes = landmarkInfo.GetDimInfo().GetValidIndexAll();
-
-                               // If INFERENCE_LANDMARK_DECODING_TYPE_BYPASS,
-                               // the landmarkChannel is guessed from the shape of the landmark output tensor.
-                               // Otherwise, it is guessed from the heatMapChannel.
-                               int landmarkChannel = 0;
-                               if (landmarkInfo.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
-                                       landmarkChannel = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[channelIndexes[0]]
-                                                                               / landmarkInfo.GetOffset();
-                               } else {
-                                       landmarkChannel = heatMapChannel;
-                               }
+                       std::vector<int> channelIndexes = landmarkInfo.GetDimInfo().GetValidIndexAll();
 
-                               mPoseResult->number_of_landmarks_per_pose = mUserListName.empty() ? landmarkChannel :
-                                                                                                                       static_cast<int>(mUserListName.size());
+                       // If INFERENCE_LANDMARK_DECODING_TYPE_BYPASS,
+                       // the landmarkChannel is guessed from the shape of the landmark output tensor.
+                       // Otherwise, it is guessed from the heatMapChannel. (heatMapChannel is used in default).
+                       int landmarkChannel = heatMapChannel;
 
-                               LOGE("number of landmarks per pose: %d", mPoseResult->number_of_landmarks_per_pose );
-                               mPoseResult->landmarks = new mv_inference_landmark_s* [defaultNumberOfPose];
-                               for (int idx = 0; idx < defaultNumberOfPose; ++idx) {
-                                       mPoseResult->landmarks[idx] = new mv_inference_landmark_s [mPoseResult->number_of_landmarks_per_pose];
-                               }
+                       if (landmarkInfo.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS)
+                               landmarkChannel = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[channelIndexes[0]] / landmarkInfo.GetOffset();
+
+                       poseResult->number_of_landmarks_per_pose = mUserListName.empty() ? landmarkChannel :
+                                                                                                               static_cast<int>(mUserListName.size());
+
+                       LOGE("number of landmarks per pose: %d", poseResult->number_of_landmarks_per_pose );
+
+                       if (poseResult->number_of_landmarks_per_pose >= MAX_NUMBER_OF_LANDMARKS_PER_POSE) {
+                               LOGE("Exceeded maxinum number of landmarks per pose(%d >= %d).",
+                                       poseResult->number_of_landmarks_per_pose, MAX_NUMBER_OF_LANDMARKS_PER_POSE);
+                               return MEDIA_VISION_ERROR_INVALID_PARAMETER;
                        }
 
                        // decoding
                        PoseDecoder poseDecoder(mOutputTensorBuffers, outputMeta,
                                                                        heatMapWidth, heatMapHeight, heatMapChannel,
-                                                                       mPoseResult->number_of_landmarks_per_pose);
+                                                                       poseResult->number_of_landmarks_per_pose);
 
                        // initialize decorder queue with landmarks to be decoded.
                        int ret = poseDecoder.init();
@@ -1733,9 +1718,9 @@ namespace inference
                        poseDecoder.decode(inputW, inputH, thresRadius);
 
                        int part = 0;
-                       mPoseResult->number_of_poses = poseDecoder.getNumberOfPose();
-                       for (int poseIndex = 0; poseIndex < mPoseResult->number_of_poses; ++poseIndex) {
-                               for (int landmarkIndex = 0; landmarkIndex < mPoseResult->number_of_landmarks_per_pose; ++ landmarkIndex) {
+                       poseResult->number_of_poses = poseDecoder.getNumberOfPose();
+                       for (int poseIndex = 0; poseIndex < poseResult->number_of_poses; ++poseIndex) {
+                               for (int landmarkIndex = 0; landmarkIndex < poseResult->number_of_landmarks_per_pose; ++ landmarkIndex) {
                                        part = landmarkIndex;
                                        if (!mUserListName.empty()) {
                                                part = std::stoi(mUserListName[landmarkIndex]) - 1;
@@ -1744,18 +1729,18 @@ namespace inference
                                                }
                                        }
 
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].point.x =
+                                       poseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
+                                       poseResult->landmarks[poseIndex][landmarkIndex].point.x =
                                                                                poseDecoder.getPointX(poseIndex, part) * static_cast<float>(mSourceSize.width);
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].point.y =
+                                       poseResult->landmarks[poseIndex][landmarkIndex].point.y =
                                                                                poseDecoder.getPointY(poseIndex, part) * static_cast<float>(mSourceSize.height);
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].label = landmarkIndex;
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].score =
+                                       poseResult->landmarks[poseIndex][landmarkIndex].label = landmarkIndex;
+                                       poseResult->landmarks[poseIndex][landmarkIndex].score =
                                                                                poseDecoder.getScore(poseIndex, part);
                                }
                        }
-                       *detectionResults = static_cast<mv_inference_pose_result_h>(mPoseResult);
 
+                       detectionResults = std::move(poseResult);
                } else {
                        tensor_t outputData;
 
@@ -1766,47 +1751,27 @@ namespace inference
                                        return ret;
                        }
 
-                       long number_of_poses = 1;
-                       long number_of_landmarks = outputData.dimInfo[0][3];
-                       float *tmp = static_cast<float *>(outputData.data[0]);
-                       cv::Size heatMapSize(outputData.dimInfo[0][1], outputData.dimInfo[0][2]);
-
-                       cv::Point loc;
-                       cv::Point2f loc2f;
-                       double score;
-                       cv::Mat blurredHeatMap;
-
                        cv::Mat reShapeTest(cv::Size(outputData.dimInfo[0][2], outputData.dimInfo[0][1]),
-                                                                                                       CV_32FC(outputData.dimInfo[0][3]), (void *) tmp);
-
+                                                                                CV_32FC(outputData.dimInfo[0][3]), outputData.data[0]);
                        cv::Mat multiChannels[outputData.dimInfo[0][3]];
+
                        split(reShapeTest, multiChannels);
 
                        float ratioX = static_cast<float>(outputData.dimInfo[0][2]);
                        float ratioY = static_cast<float>(outputData.dimInfo[0][1]);
 
-                       if (mPoseResult == NULL) {
-                               if(!mUserListName.empty()) {
-                                       number_of_landmarks = mUserListName.size();
-                               }
-                               mPoseResult = new mv_inference_pose_s;
-                               if (mPoseResult == NULL) {
-                                       LOGE("Fail to create result handle");
-                                       return MEDIA_VISION_ERROR_INTERNAL;
-                               }
+                       poseResult->number_of_poses = 1;
+                       poseResult->number_of_landmarks_per_pose = outputData.dimInfo[0][3];
 
-                               mPoseResult->number_of_poses= number_of_poses;
-                               mPoseResult->number_of_landmarks_per_pose = number_of_landmarks;
-                               mPoseResult->landmarks = new mv_inference_landmark_s*[number_of_poses];
-                               for (int poseIndex = 0; poseIndex < number_of_poses; ++poseIndex) {
-                                       mPoseResult->landmarks[poseIndex] = new mv_inference_landmark_s[number_of_landmarks];
-                               }
+                       if (poseResult->number_of_landmarks_per_pose >= MAX_NUMBER_OF_LANDMARKS_PER_POSE) {
+                               LOGE("Exeeded maxinum number of landmarks per pose(%d >= %d).",
+                                       poseResult->number_of_landmarks_per_pose, MAX_NUMBER_OF_LANDMARKS_PER_POSE);
+                               return MEDIA_VISION_ERROR_INVALID_PARAMETER;
                        }
 
-                       int part = 0;
-                       for (int poseIndex = 0; poseIndex < number_of_poses; ++poseIndex) {
-                               for (int landmarkIndex = 0; landmarkIndex < number_of_landmarks; landmarkIndex++) {
-                                       part = landmarkIndex;
+                       for (int poseIndex = 0; poseIndex < poseResult->number_of_poses; ++poseIndex) {
+                               for (int landmarkIndex = 0; landmarkIndex < poseResult->number_of_landmarks_per_pose; landmarkIndex++) {
+                                       int part = landmarkIndex;
                                        if (!mUserListName.empty()) {
                                                part = std::stoi(mUserListName[landmarkIndex]) - 1;
                                                if (part < 0) {
@@ -1815,6 +1780,10 @@ namespace inference
                                        }
 
                                        cv::Mat heatMap = multiChannels[part];
+                                       double score;
+                                       cv::Point loc;
+                                       cv::Point2f loc2f;
+                                       cv::Mat blurredHeatMap;
 
                                        cv::GaussianBlur(heatMap, blurredHeatMap, cv::Size(), 5.0, 5.0);
                                        cv::minMaxLoc(heatMap, NULL, &score, NULL, &loc);
@@ -1824,14 +1793,15 @@ namespace inference
                                        LOGI("landmarkIndex[%2d] - mapping to [%2d]: x[%.3f], y[%.3f], score[%.3f]",
                                                        landmarkIndex, part, loc2f.x, loc2f.y, score);
 
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].point.x = static_cast<int>(static_cast<float>(width) * loc2f.x);
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].point.y = static_cast<int>(static_cast<float>(height) * loc2f.y);
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].score = score;
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].label = -1;
+                                       poseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
+                                       poseResult->landmarks[poseIndex][landmarkIndex].point.x = static_cast<int>(static_cast<float>(width) * loc2f.x);
+                                       poseResult->landmarks[poseIndex][landmarkIndex].point.y = static_cast<int>(static_cast<float>(height) * loc2f.y);
+                                       poseResult->landmarks[poseIndex][landmarkIndex].score = score;
+                                       poseResult->landmarks[poseIndex][landmarkIndex].label = -1;
                                }
                        }
-                       *detectionResults = static_cast<mv_inference_pose_result_h>(mPoseResult);
+
+                       detectionResults = std::move(poseResult);
                }
 
                return MEDIA_VISION_ERROR_NONE;
index 466b368..ed09bb0 100644 (file)
@@ -851,27 +851,25 @@ int mv_inference_pose_landmark_detect_open(
                return ret;
        }
 
-       mv_inference_pose_result_h result = NULL;
+       std::unique_ptr<mv_inference_pose_s> pose;
 
        ret = pInfer->GetPoseLandmarkDetectionResults(
-                       &result, width, height);
+                       pose, width, height);
        if (ret != MEDIA_VISION_ERROR_NONE) {
                LOGE("Fail to get inference results");
                return ret;
        }
 
-       mv_inference_pose_s *pose_obj = static_cast<mv_inference_pose_s *>(result);
-
-       for (int pose = 0; pose < pose_obj->number_of_poses; ++pose) {
-               for (int index = 0; index < pose_obj->number_of_landmarks_per_pose; ++index) {
-                       LOGI("PoseIdx[%2d]: x[%d], y[%d], score[%.3f]", index,
-                                                                               pose_obj->landmarks[pose][index].point.x,
-                                                                               pose_obj->landmarks[pose][index].point.y,
-                                                                               pose_obj->landmarks[pose][index].score);
+       for (int pose_index = 0; pose_index < pose->number_of_poses; ++pose_index) {
+               for (int landmark_index = 0; landmark_index < pose->number_of_landmarks_per_pose; ++landmark_index) {
+                       LOGI("PoseIdx[%2d]: x[%d], y[%d], score[%.3f]", landmark_index,
+                                                                               pose->landmarks[pose_index][landmark_index].point.x,
+                                                                               pose->landmarks[pose_index][landmark_index].point.y,
+                                                                               pose->landmarks[pose_index][landmark_index].score);
                }
        }
 
-       detected_cb(source, result, user_data);
+       detected_cb(source, static_cast<mv_inference_pose_result_h>(pose.get()), user_data);
 
        return ret;
 }
index 511e837..ee7e44e 100644 (file)
@@ -1,6 +1,6 @@
 Name:        capi-media-vision
 Summary:     Media Vision library for Tizen Native API
-Version:     0.8.16
+Version:     0.8.17
 Release:     1
 Group:       Multimedia/Framework
 License:     Apache-2.0 and BSD-3-Clause