Add pose landmark detection inference with OutputMetadata 25/258525/3
authorTae-Young Chung <ty83.chung@samsung.com>
Thu, 20 May 2021 01:12:54 +0000 (10:12 +0900)
committerTae-Young Chung <ty83.chung@samsung.com>
Wed, 30 Jun 2021 09:03:32 +0000 (18:03 +0900)
Support models that require heatmap decoding with additional refinement,
as well as models that directly provide landmark results.

Change-Id: Ic993510d2655d488ea8a43e08a56c13d2f9bc94f
Signed-off-by: Tae-Young Chung <ty83.chung@samsung.com>
include/mv_inference_private.h
mv_machine_learning/mv_inference/inference/include/Inference.h
mv_machine_learning/mv_inference/inference/include/PoseDecoder.h
mv_machine_learning/mv_inference/inference/src/Inference.cpp
mv_machine_learning/mv_inference/inference/src/PoseDecoder.cpp
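
A minimal decoding sketch (illustration only, not the PoseDecoder added by this
change): it shows the two paths the commit message describes, assuming a heatmap
tensor laid out as height x width x channels with one channel per landmark, and
it substitutes a simple neighbor-based sub-cell shift for the offset /
displacement-vector refinement that the real decoder drives from the output
metadata. All names below (DecodedLandmark, decodeHeatmap, readLandmarks) are
illustrative.

    #include <cstddef>
    #include <vector>

    struct DecodedLandmark {
        float x;      /* normalized x in [0, 1] */
        float y;      /* normalized y in [0, 1] */
        float score;  /* peak heatmap value, or the model-provided score */
    };

    /* Heatmap path: arg-max per landmark channel, then a simple neighbor-based
     * sub-cell refinement. Layout assumed: [height][width][channels]. */
    static std::vector<DecodedLandmark>
    decodeHeatmap(const std::vector<float> &heatmap, int height, int width, int channels)
    {
        std::vector<DecodedLandmark> result(channels, { -1.0f, -1.0f, -1.0f });

        for (int c = 0; c < channels; ++c) {
            auto at = [&](int y, int x) { return heatmap[(y * width + x) * channels + c]; };

            /* Find the peak cell for this landmark channel. */
            int bestY = 0, bestX = 0;
            for (int y = 0; y < height; ++y)
                for (int x = 0; x < width; ++x)
                    if (at(y, x) > at(bestY, bestX)) {
                        bestY = y;
                        bestX = x;
                    }

            /* Refinement: shift a quarter cell toward the larger neighbor. */
            float fx = static_cast<float>(bestX);
            float fy = static_cast<float>(bestY);
            if (bestX > 0 && bestX < width - 1)
                fx += at(bestY, bestX + 1) > at(bestY, bestX - 1) ? 0.25f : -0.25f;
            if (bestY > 0 && bestY < height - 1)
                fy += at(bestY + 1, bestX) > at(bestY - 1, bestX) ? 0.25f : -0.25f;

            /* Convert the cell position to normalized image coordinates. */
            result[c].x = (fx + 0.5f) / static_cast<float>(width);
            result[c].y = (fy + 0.5f) / static_cast<float>(height);
            result[c].score = at(bestY, bestX);
        }
        return result;
    }

    /* Direct path: the model already outputs normalized (x, y) pairs plus one
     * score per landmark, so no heatmap decoding is needed. */
    static std::vector<DecodedLandmark>
    readLandmarks(const std::vector<float> &points, const std::vector<float> &scores)
    {
        std::vector<DecodedLandmark> result(scores.size());
        for (std::size_t i = 0; i < scores.size(); ++i)
            result[i] = { points[2 * i], points[2 * i + 1], scores[i] };
        return result;
    }

In the patch below, GetPoseLandmarkDetectionResults() builds a PoseDecoder over
the output tensor buffers and the parsed output metadata; DecodingType
distinguishes models that need heatmap decoding from models that already emit
landmark coordinates.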

index 95dff31..0d8a35e 100644 (file)
@@ -34,11 +34,19 @@ extern "C" {
   * @since_tizen 6.0
   *
   */
-typedef struct {
+typedef struct mv_inference_landmark_s {
     bool isAvailable;   /**< Availability of landmark */
     mv_point_s point;   /**< 2D position of landmark */
     int label;    /**< Label of landmark */
     float score;        /**< Score of landmark */
+
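+    /* Default-initialize to "no landmark" so that newly allocated arrays of
+     * mv_inference_landmark_s need no explicit per-element reset. */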
+    mv_inference_landmark_s() {
+      isAvailable = false;
+      point.x = -1;
+      point.y = -1;
+      label = -1;
+      score = -1.0f;
+    }
 } mv_inference_landmark_s;
 
 /**
index 6c88b95..3fea65d 100644 (file)
@@ -32,6 +32,8 @@
 #include "PostProcess.h"
 #include "TensorBuffer.h"
 
+#include "Landmark.h"
+
 #define HUMAN_POSE_MAX_LANDMARKS 16
 #define HUMAN_POSE_MAX_PARTS 6
 
index c910d62..545c385 100644 (file)
 #include <map>
 #include <list>
 
-
 #include "TensorBuffer.h"
 #include "OutputMetadata.h"
 #include "Landmark.h"
 
+#define MAX_NUMBER_OF_POSE 5
+
 /**
  * @file PoseDecoder.h
  * @brief This file contains the PoseDecoder class definition which
index 0a20d19..94582b3 100755 (executable)
@@ -18,7 +18,9 @@
 #include "Inference.h"
 #include "InferenceIni.h"
 #include "ObjectDecoder.h"
+#include "PoseDecoder.h"
 #include <map>
+#include <list>
 
 #include <unistd.h>
 #include <fstream>
@@ -471,6 +473,16 @@ namespace inference
 
                        if (!outputMeta.GetNumber().GetName().empty())
                                mConfig.mOutputLayerNames.push_back(outputMeta.GetNumber().GetName());
+
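+                       // Also register the landmark, offset, and displacement-vector tensors
+                       // named in the output metadata as output layers.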
+                       if (!outputMeta.GetLandmark().GetName().empty())
+                               mConfig.mOutputLayerNames.push_back(outputMeta.GetLandmark().GetName());
+
+                       if (!outputMeta.GetOffset().GetName().empty())
+                               mConfig.mOutputLayerNames.push_back(outputMeta.GetOffset().GetName());
+
+                       for (auto& dispVec : outputMeta.GetDispVecAll()) {
+                               mConfig.mOutputLayerNames.push_back(dispVec.GetName());
+                       }
                }
 
                inference_engine_layer_property property;
@@ -480,6 +492,7 @@ namespace inference
                                                        INFERENCE_TENSOR_DATA_TYPE_FLOAT32,
                                                        1};
                for (auto& name : mConfig.mOutputLayerNames) {
+                       LOGI("Configure %s layer as output", name.c_str());
                        property.layers.insert(std::make_pair(name, tensor_info));
                }
 
@@ -1665,66 +1678,155 @@ namespace inference
        int Inference::GetPoseLandmarkDetectionResults(
                        mv_inference_pose_result_h *detectionResults, int width, int height)
        {
-               tensor_t outputData;
+               LOGI("ENTER");
+               OutputMetadata& outputMeta = mMetadata.GetOutputMeta();
+               if (outputMeta.IsParsed()) {
+                       auto& landmarkInfo = outputMeta.GetLandmark();
+                       auto& scoreInfo = outputMeta.GetScore();
+                       if (!mOutputTensorBuffers.exist(landmarkInfo.GetName()) ||
+                               !mOutputTensorBuffers.exist(scoreInfo.GetName())) {
+                               LOGE("output buffers named %s or %s do not exist",
+                                       landmarkInfo.GetName().c_str(), scoreInfo.GetName().c_str());
+                               return MEDIA_VISION_ERROR_INVALID_OPERATION;
+                       }
 
-               // Get inference result and contain it to outputData.
-               int ret = FillOutputResult(outputData);
-               if (ret != MEDIA_VISION_ERROR_NONE) {
-                               LOGE("Fail to get output result.");
-                               return ret;
-               }
+                       int heatMapWidth = 0;
+                       int heatMapHeight = 0;
+                       int heatMapChannel = 0;
+                       if (landmarkInfo.GetDecodingType() != 0) {
+                               heatMapWidth = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.wIdx];
+                               heatMapHeight = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.hIdx];
+                               heatMapChannel = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.cIdx];
+                       }
 
-               std::vector<std::vector<int> > inferDimInfo(outputData.dimInfo);
-               std::vector<void *> inferResults(outputData.data.begin(),
-                                                                               outputData.data.end());
+                       LOGI("heatMap: w[%d], h[%d], c[%d]", heatMapWidth, heatMapHeight, heatMapChannel);
 
-               long number_of_poses = 1;
-               long number_of_landmarks = inferDimInfo[0][3];
-               float *tmp = static_cast<float *>(inferResults[0]);
-               cv::Size heatMapSize(inferDimInfo[0][1], inferDimInfo[0][2]);
+                       if (mPoseResult == NULL) {
+                               mPoseResult = new(std::nothrow) mv_inference_pose_s;
+                               if (mPoseResult == NULL) {
+                                       LOGE("Fail to create result handle");
+                                       return MEDIA_VISION_ERROR_INTERNAL;
+                               }
+                               // Landmark type: 0 = 2D single, 1 = 2D multi, 2 = 3D single, 3 = 3D multi.
+                               // Single-pose types (0 and 2) yield one pose; multi-pose types allow up to MAX_NUMBER_OF_POSE.
+                               int defaultNumberOfPose = (landmarkInfo.GetType() == 0 || landmarkInfo.GetType() == 2) ? 1 : MAX_NUMBER_OF_POSE;
+                               std::vector<int> channelIndexes = landmarkInfo.GetDimInfo().GetValidIndexAll();
+
+                               // When DecodingType == 0, landmarkChannel is derived from the shape of
+                               // the landmark output tensor. Otherwise (heatmap decoding), it comes
+                               // from heatMapChannel.
+                               int landmarkChannel = 0;
+                               if (landmarkInfo.GetDecodingType() == 0) {
+                                       landmarkChannel = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[channelIndexes[0]]
+                                                                               / landmarkInfo.GetOffset();
+                               } else {
+                                       landmarkChannel = heatMapChannel;
+                               }
 
-               cv::Point loc;
-               cv::Point2f loc2f;
-               double score;
-               cv::Mat blurredHeatMap;
+                               mPoseResult->number_of_landmarks_per_pose = mUserListName.empty() ? landmarkChannel :
+                                                                                                                       static_cast<int>(mUserListName.size());
 
-               cv::Mat reShapeTest(cv::Size(inferDimInfo[0][2], inferDimInfo[0][1]),
-                                                                                               CV_32FC(inferDimInfo[0][3]), (void *) tmp);
+                               LOGI("number of landmarks per pose: %d", mPoseResult->number_of_landmarks_per_pose);
+                               mPoseResult->landmarks = new mv_inference_landmark_s* [defaultNumberOfPose];
+                               for (int idx = 0; idx < defaultNumberOfPose; ++idx) {
+                                       mPoseResult->landmarks[idx] = new mv_inference_landmark_s [mPoseResult->number_of_landmarks_per_pose];
+                               }
+                       }
+
+                       // Decode the output tensors into landmark positions.
+                       PoseDecoder poseDecoder(mOutputTensorBuffers, outputMeta,
+                                                                       heatMapWidth, heatMapHeight, heatMapChannel,
+                                                                       mPoseResult->number_of_landmarks_per_pose);
+
+                       // Initialize the decoder queue with the landmarks to be decoded.
+                       int ret = poseDecoder.init();
+                       if (ret != MEDIA_VISION_ERROR_NONE) {
+                               LOGE("Fail to init poseDecoder");
+                               return ret;
+                       }
+
+                       float inputW = static_cast<float>(mMetadata.GetInputMeta().layer.begin()->second.GetWidth());
+                       float inputH = static_cast<float>(mMetadata.GetInputMeta().layer.begin()->second.GetHeight());
+                       float thresRadius = landmarkInfo.GetType() == 0 ? 0.0f : landmarkInfo.GetDecodingInfo().heatMap.nmsRadius;
+                       poseDecoder.decode(inputW, inputH, thresRadius);
+
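+                       // Copy the decoded landmarks into mPoseResult, scaling the decoder's
+                       // normalized coordinates to the source image size.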
+                       int part = 0;
+                       mPoseResult->number_of_poses = poseDecoder.getNumberOfPose();
+                       for (int poseIndex = 0; poseIndex < mPoseResult->number_of_poses; ++poseIndex) {
+                               for (int landmarkIndex = 0; landmarkIndex < mPoseResult->number_of_landmarks_per_pose; ++landmarkIndex) {
+                                       part = landmarkIndex;
+                                       if (!mUserListName.empty()) {
+                                               part = std::stoi(mUserListName[landmarkIndex]) - 1;
+                                               if (part < 0) {
+                                                       continue;
+                                               }
+                                       }
 
-               cv::Mat multiChannels[inferDimInfo[0][3]];
-               split(reShapeTest, multiChannels);
+                                       mPoseResult->landmarks[poseIndex][landmarkIndex].isAvailable = true;
+                                       mPoseResult->landmarks[poseIndex][landmarkIndex].point.x =
+                                                                               poseDecoder.getPointX(poseIndex, part) * static_cast<float>(mSourceSize.width);
+                                       mPoseResult->landmarks[poseIndex][landmarkIndex].point.y =
+                                                                               poseDecoder.getPointY(poseIndex, part) * static_cast<float>(mSourceSize.height);
+                                       mPoseResult->landmarks[poseIndex][landmarkIndex].label = landmarkIndex;
+                                       mPoseResult->landmarks[poseIndex][landmarkIndex].score =
+                                                                               poseDecoder.getScore(poseIndex, part);
+                               }
+                       }
+                       *detectionResults = static_cast<mv_inference_pose_result_h>(mPoseResult);
 
-               float ratioX = static_cast<float>(inferDimInfo[0][2]);
-               float ratioY = static_cast<float>(inferDimInfo[0][1]);
+               } else {
+                       tensor_t outputData;
 
-               if (mPoseResult == NULL) {
-                       if(!mUserListName.empty()) {
-                               number_of_landmarks = mUserListName.size();
+                       // Get inference result and contain it to outputData.
+                       int ret = FillOutputResult(outputData);
+                       if (ret != MEDIA_VISION_ERROR_NONE) {
+                                       LOGE("Fail to get output result.");
+                                       return ret;
                        }
-                       mPoseResult = new mv_inference_pose_s;
+
+                       std::vector<std::vector<int> > inferDimInfo(outputData.dimInfo);
+                       std::vector<void *> inferResults(outputData.data.begin(),
+                                                                                       outputData.data.end());
+
+                       long number_of_poses = 1;
+                       long number_of_landmarks = inferDimInfo[0][3];
+                       float *tmp = static_cast<float *>(inferResults[0]);
+                       cv::Size heatMapSize(inferDimInfo[0][1], inferDimInfo[0][2]);
+
+                       cv::Point loc;
+                       cv::Point2f loc2f;
+                       double score;
+                       cv::Mat blurredHeatMap;
+
+                       cv::Mat reShapeTest(cv::Size(inferDimInfo[0][2], inferDimInfo[0][1]),
+                                                                                                       CV_32FC(inferDimInfo[0][3]), (void *) tmp);
+
+                       cv::Mat multiChannels[inferDimInfo[0][3]];
+                       split(reShapeTest, multiChannels);
+
+                       float ratioX = static_cast<float>(inferDimInfo[0][2]);
+                       float ratioY = static_cast<float>(inferDimInfo[0][1]);
+
                        if (mPoseResult == NULL) {
-                               LOGE("Fail to create result handle");
-                               return MEDIA_VISION_ERROR_INTERNAL;
-                       }
+                               if(!mUserListName.empty()) {
+                                       number_of_landmarks = mUserListName.size();
+                               }
+                               mPoseResult = new(std::nothrow) mv_inference_pose_s;
+                               if (mPoseResult == NULL) {
+                                       LOGE("Fail to create result handle");
+                                       return MEDIA_VISION_ERROR_INTERNAL;
+                               }
 
-                       mPoseResult->number_of_poses= number_of_poses;
-                       mPoseResult->number_of_landmarks_per_pose = number_of_landmarks;
-                       mPoseResult->landmarks = new mv_inference_landmark_s*[number_of_poses];
-                       for (int poseIndex = 0; poseIndex < number_of_poses; ++poseIndex) {
-                               mPoseResult->landmarks[poseIndex] = new mv_inference_landmark_s[number_of_landmarks];
-                               for (int landmarkIndex = 0; landmarkIndex < number_of_landmarks; ++landmarkIndex) {
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].isAvailable = false;
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].point.x = -1;
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].point.y = -1;
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].label = -1;
-                                       mPoseResult->landmarks[poseIndex][landmarkIndex].score = -1.0f;
+                               mPoseResult->number_of_poses = number_of_poses;
+                               mPoseResult->number_of_landmarks_per_pose = number_of_landmarks;
+                               mPoseResult->landmarks = new mv_inference_landmark_s*[number_of_poses];
+                               for (int poseIndex = 0; poseIndex < number_of_poses; ++poseIndex) {
+                                       mPoseResult->landmarks[poseIndex] = new mv_inference_landmark_s[number_of_landmarks];
                                }
                        }
-               }
 
-               int part = 0;
-               for (int poseIndex = 0; poseIndex < number_of_poses; ++poseIndex) {
-                       for (int landmarkIndex = 0; landmarkIndex < number_of_landmarks; landmarkIndex++) {
+                       int part = 0;
+                       for (int poseIndex = 0; poseIndex < number_of_poses; ++poseIndex) {
+                               for (int landmarkIndex = 0; landmarkIndex < number_of_landmarks; landmarkIndex++) {
                                        part = landmarkIndex;
                                        if (!mUserListName.empty()) {
                                                part = std::stoi(mUserListName[landmarkIndex]) - 1;
@@ -1748,11 +1850,11 @@ namespace inference
                                        mPoseResult->landmarks[poseIndex][landmarkIndex].point.y = static_cast<int>(static_cast<float>(height) * loc2f.y);
                                        mPoseResult->landmarks[poseIndex][landmarkIndex].score = score;
                                        mPoseResult->landmarks[poseIndex][landmarkIndex].label = -1;
+                               }
                        }
+                       *detectionResults = static_cast<mv_inference_pose_result_h>(mPoseResult);
                }
 
-               *detectionResults = static_cast<mv_inference_pose_result_h>(mPoseResult);
-
                return MEDIA_VISION_ERROR_NONE;
        }
 
index f30fbf9..271f068 100644 (file)
@@ -22,7 +22,6 @@
 #include <fstream>
 #include <string>
 
-#define MAX_NUMBER_OF_POSE 5
 #define MAX_NUMBER_OF_CORRECTION 3
 
 namespace mediavision