Add facial landmark detection inference with outputmetadata 68/261268/9
author Tae-Young Chung <ty83.chung@samsung.com>
Wed, 14 Jul 2021 07:01:34 +0000 (16:01 +0900)
committer Tae-Young Chung <ty83.chung@samsung.com>
Fri, 16 Jul 2021 03:00:00 +0000 (03:00 +0000)
Change-Id: I03deac554f22ec9fe079b38d9562fd667b854495
Signed-off-by: Tae-Young Chung <ty83.chung@samsung.com>
mv_machine_learning/mv_inference/inference/include/Inference.h
mv_machine_learning/mv_inference/inference/src/Inference.cpp
mv_machine_learning/mv_inference/inference/src/PoseDecoder.cpp
mv_machine_learning/mv_inference/inference/src/mv_inference_open.cpp

index 3fea65d..030f9ec 100644 (file)
@@ -310,7 +310,8 @@ namespace inference
                 * @since_tizen 5.5
                 * @return @c true on success, otherwise a negative error value
                 */
-               int GetFacialLandMarkDetectionResults(FacialLandMarkDetectionResults *results);
+               int GetFacialLandMarkDetectionResults(FacialLandMarkDetectionResults *results,
+                                                                               int width, int height);
 
                /**
                 * @brief       Gets the PoseLandmarkDetectionResults
index 3a2d20f..c7ca663 100755 (executable)
@@ -1635,41 +1635,101 @@ namespace inference
        }
 
        int Inference::GetFacialLandMarkDetectionResults(
-                       FacialLandMarkDetectionResults *detectionResults)
+                       FacialLandMarkDetectionResults *detectionResults, int width, int height)
        {
-               tensor_t outputData;
+               LOGI("ENTER");
+               FacialLandMarkDetectionResults results;
+               OutputMetadata& outputMeta = mMetadata.GetOutputMeta();
+               if (outputMeta.IsParsed()) {
+                       auto& landmarkInfo = outputMeta.GetLandmark();
+                       auto& scoreInfo = outputMeta.GetScore();
+                       if (!mOutputTensorBuffers.exist(landmarkInfo.GetName()) ||
+                               !mOutputTensorBuffers.exist(scoreInfo.GetName())) {
+                               LOGE("output buffers named of %s or %s are NULL",
+                                       landmarkInfo.GetName().c_str(), scoreInfo.GetName().c_str());
+                               return MEDIA_VISION_ERROR_INVALID_OPERATION;
+                       }
 
-               // Get inference result and contain it to outputData.
-               int ret = FillOutputResult(outputData);
-               if (ret != MEDIA_VISION_ERROR_NONE) {
-                       LOGE("Fail to get output result.");
-                       return ret;
-               }
+                       int heatMapWidth = 0;
+                       int heatMapHeight = 0;
+                       int heatMapChannel = 0;
+                       if (landmarkInfo.GetDecodingType() != 0) {
+                               heatMapWidth = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.wIdx];
+                               heatMapHeight = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.hIdx];
+                               heatMapChannel = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.cIdx];
+                       }
+
+                       int number_of_landmarks = 0;
+                       std::vector<int> channelIndexes = landmarkInfo.GetDimInfo().GetValidIndexAll();
+                       if (landmarkInfo.GetDecodingType() == 0) {
+                               LOGI("landmark dim size: %zd and idx[0] is %d", channelIndexes.size(), channelIndexes[0]);
+                               number_of_landmarks = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[channelIndexes[0]]
+                                                                       / landmarkInfo.GetOffset();
+                       } else {
+                               number_of_landmarks = heatMapChannel;
+                       }
+                       LOGI("heatMap: w[%d], h[%d], c[%d]", heatMapWidth, heatMapHeight, heatMapChannel);
+
+                       // decoding
+                       PoseDecoder poseDecoder(mOutputTensorBuffers, outputMeta,
+                                                                       heatMapWidth, heatMapHeight, heatMapChannel,
+                                                                       number_of_landmarks);
+                       // initialize decoder queue with landmarks to be decoded.
+                       int ret = poseDecoder.init();
+                       if (ret != MEDIA_VISION_ERROR_NONE) {
+                               LOGE("Fail to init poseDecoder");
+                               return ret;
+                       }
 
-               std::vector<std::vector<int> > inferDimInfo(outputData.dimInfo);
-               std::vector<void *> inferResults(outputData.data.begin(),
-                                                                                outputData.data.end());
+                       float inputW = 1.f;
+                       float inputH = 1.f;
+                       if (landmarkInfo.GetCoordinate() == 1) {
+                               inputW = static_cast<float>(mMetadata.GetInputMeta().layer.begin()->second.GetWidth());
+                               inputH = static_cast<float>(mMetadata.GetInputMeta().layer.begin()->second.GetHeight());
+                       }
+                       float thresRadius = landmarkInfo.GetType() == 0 ? 0.0 : outputMeta.GetLandmark().GetDecodingInfo().heatMap.nmsRadius;
+                       poseDecoder.decode(inputW, inputH, thresRadius);
 
-               long number_of_detections = inferDimInfo[0][1];
-               float *loc = reinterpret_cast<float *>(inferResults[0]);
+                       for (int landmarkIndex = 0; landmarkIndex < number_of_landmarks; landmarkIndex++) {
+                               results.locations.push_back(
+                                       cv::Point(poseDecoder.getPointX(0, landmarkIndex) * static_cast<float>(mSourceSize.width),
+                                                         poseDecoder.getPointY(0, landmarkIndex) * static_cast<float>(mSourceSize.height)));
+                       }
+                       results.number_of_landmarks = results.locations.size();
+                       *detectionResults = results;
+               } else {
+                       tensor_t outputData;
 
-               FacialLandMarkDetectionResults results;
-               results.number_of_landmarks = 0;
+                       // Get inference result and contain it to outputData.
+                       int ret = FillOutputResult(outputData);
+                       if (ret != MEDIA_VISION_ERROR_NONE) {
+                               LOGE("Fail to get output result.");
+                               return ret;
+                       }
+
+                       std::vector<std::vector<int> > inferDimInfo(outputData.dimInfo);
+                       std::vector<void *> inferResults(outputData.data.begin(),
+                                                                                       outputData.data.end());
 
-               cv::Point point(0, 0);
-               results.number_of_landmarks = 0;
-               LOGI("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height);
-               for (int idx = 0; idx < number_of_detections; idx += 2) {
-                       point.x = static_cast<int>(loc[idx] * mSourceSize.width);
-                       point.y = static_cast<int>(loc[idx + 1] * mSourceSize.height);
+                       long number_of_detections = inferDimInfo[0][1];
+                       float *loc = reinterpret_cast<float *>(inferResults[0]);
 
-                       results.locations.push_back(point);
-                       results.number_of_landmarks++;
+                       results.number_of_landmarks = 0;
 
-                       LOGI("x:%d, y:%d", point.x, point.y);
-               }
+                       cv::Point point(0, 0);
+                       LOGI("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height);
+                       for (int idx = 0; idx < number_of_detections; idx += 2) {
+                               point.x = static_cast<int>(loc[idx] * mSourceSize.width);
+                               point.y = static_cast<int>(loc[idx + 1] * mSourceSize.height);
+
+                               results.locations.push_back(point);
+                               results.number_of_landmarks++;
 
-               *detectionResults = results;
+                               LOGI("x:%d, y:%d", point.x, point.y);
+                       }
+
+                       *detectionResults = results;
+               }
                LOGE("Inference: FacialLandmarkDetectionResults: %d\n",
                         results.number_of_landmarks);
                return MEDIA_VISION_ERROR_NONE;
index 271f068..cce5143 100644 (file)
@@ -318,14 +318,16 @@ namespace inference
                } else {
                        // multi pose is not supported
                        std::vector<int> scoreIndexes = scoreInfo.GetDimInfo().GetValidIndexAll();
-                       float poseScore  = mTensorBuffer.getValue<float>(scoreInfo.GetName(), scoreIndexes[scoreIndexes[0]]);
-                       if (scoreInfo.GetType() == 1) {
-                               poseScore = PostProcess::sigmoid(poseScore);
-                       }
-                       if (poseScore < scoreInfo.GetThresHold()) {
-                               LOGI("pose score %.4f is lower than %.4f", poseScore, scoreInfo.GetThresHold());
-                               LOGI("LEAVE");
-                               return MEDIA_VISION_ERROR_NONE;
+                       float poseScore = scoreInfo.GetThresHold();
+                       if (!scoreIndexes.empty()) {
+                               poseScore  = mTensorBuffer.getValue<float>(scoreInfo.GetName(), scoreIndexes[scoreIndexes[0]]);
+                               if (scoreInfo.GetType() == 1) {
+                                       poseScore = PostProcess::sigmoid(poseScore);
+                               }
+                               if (poseScore < scoreInfo.GetThresHold()) {
+                                       LOGI("pose score %.4f is lower than %.4f\n[LEAVE]", poseScore, scoreInfo.GetThresHold());
+                                       return MEDIA_VISION_ERROR_NONE;
+                               }
                        }
 
                        int landmarkOffset = (landmarkInfo.GetType() == 0 || landmarkInfo.GetType() == 1) ? 2 : 3;
index 1c4eb7e..c6bb99a 100644 (file)
@@ -771,6 +771,19 @@ int mv_inference_facial_landmark_detect_open(
        std::vector<mv_source_h> sources;
        std::vector<mv_rectangle_s> rects;
 
+       unsigned int width, height;
+       ret = mv_source_get_width(source, &width);
+       if (ret != MEDIA_VISION_ERROR_NONE) {
+               LOGE("Fail to get width");
+               return ret;
+       }
+
+       ret = mv_source_get_height(source, &height);
+       if (ret != MEDIA_VISION_ERROR_NONE) {
+               LOGE("Fail to get height");
+               return ret;
+       }
+
        sources.push_back(source);
 
        if (roi != NULL)
@@ -784,7 +797,7 @@ int mv_inference_facial_landmark_detect_open(
 
        FacialLandMarkDetectionResults facialLandMarkDetectionResults;
        ret = pInfer->GetFacialLandMarkDetectionResults(
-                       &facialLandMarkDetectionResults);
+                       &facialLandMarkDetectionResults, width, height);
        if (ret != MEDIA_VISION_ERROR_NONE) {
                LOGE("Fail to get inference results");
                return ret;