Update Landmark to get multiple outputs
author    Tae-Young Chung <ty83.chung@samsung.com>
          Wed, 17 Aug 2022 00:37:32 +0000 (09:37 +0900)
committer Tae-Young Chung <ty83.chung@samsung.com>
          Wed, 17 Aug 2022 01:06:34 +0000 (10:06 +0900)
MediaPipe's face-with-attention model outputs multiple landmark
tensors, such as the facial landmarks, eyes, lips, and irises.

To support this, the landmark handling is changed so that multiple
landmark outputs can be parsed and decoded (OutputMetadata now keeps
a vector of Landmark entries instead of a single one).

Change-Id: Ia871a53b7e82af27b367ee6bde3f295e27e98729
Signed-off-by: Tae-Young Chung <ty83.chung@samsung.com>
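
For context, the consumer side of the new metadata simply iterates over the
parsed landmark entries through the indexed getters added in OutputMetadata.h.
A minimal sketch, assuming an already-parsed OutputMetadata instance;
collectLandmarkLayerNames is an illustrative helper, not part of this change:

    // Sketch only (not part of the patch): gather every landmark output layer
    // name from an already-parsed OutputMetadata, the way the new loop in
    // Inference.cpp fills mConfig.mOutputLayerNames. Assumes the inference
    // namespace from OutputMetadata.h is in scope.
    #include <string>
    #include <vector>

    #include "OutputMetadata.h"

    std::vector<std::string> collectLandmarkLayerNames(OutputMetadata &outputMeta)
    {
            std::vector<std::string> layerNames;

            for (int idx = 0; idx < static_cast<int>(outputMeta.GetLandmarksSize()); ++idx) {
                    // Each "landmark" entry in the JSON metadata exposes its own output layer.
                    layerNames.push_back(outputMeta.GetLandmarkName(idx));

                    // HEATMAP_REFINE entries additionally expose displacement-map layers.
                    for (auto &dispVec : outputMeta.GetLandmarkDispVecAll(idx))
                            layerNames.push_back(dispVec.GetName());
            }

            return layerNames;
    }

Per-entry decoding then follows the same pattern: PoseDecoder takes the entry
index (idx) and resolves GetLandmarkName(idx), GetLandmarkDecodingType(idx),
and so on against that entry, as the Inference.cpp hunks below show.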
mv_machine_learning/mv_inference/inference/include/Landmark.h
mv_machine_learning/mv_inference/inference/include/OutputMetadata.h
mv_machine_learning/mv_inference/inference/include/PoseDecoder.h
mv_machine_learning/mv_inference/inference/src/Inference.cpp
mv_machine_learning/mv_inference/inference/src/OutputMetadata.cpp
mv_machine_learning/mv_inference/inference/src/PoseDecoder.cpp
test/testsuites/machine_learning/inference/mv_facestream_test_suite.cpp

index c92e6ae160789fca26890d993843055b30382469..f7a792928706839852b8426525d4b9063de02072 100644 (file)
@@ -108,41 +108,32 @@ namespace inference
 
                int ParseLandmark(JsonObject *root)
                {
-                       // box
-                       JsonArray * rootArray = json_object_get_array_member(root, "landmark");
-                       unsigned int elements = json_array_get_length(rootArray);
+                       LOGI("ENTER");
 
-                       // TODO: handling error
-                       for (unsigned int elem = 0; elem < elements; ++elem) {
+                       name =
+                               static_cast<const char*>(json_object_get_string_member(root,"name"));
+                       LOGI("layer: %s", name.c_str());
 
-                               JsonNode *pNode = json_array_get_element(rootArray, elem);
-                               JsonObject *pObject = json_node_get_object(pNode);
+                       JsonArray * array = json_object_get_array_member(root, "index");
+                       unsigned int elements2 = json_array_get_length(array);
+                       LOGI("range dim: size[%u]", elements2);
+                       for (unsigned int elem2 = 0; elem2 < elements2; ++elem2) {
+                               if (static_cast<int>(json_array_get_int_element(array, elem2)) == 1)
+                                       dimInfo.SetValidIndex(elem2);
+                       }
 
-                               name =
-                                       static_cast<const char*>(json_object_get_string_member(pObject,"name"));
-                               LOGI("layer: %s", name.c_str());
-
-                               JsonArray * array = json_object_get_array_member(pObject, "index");
-                               unsigned int elements2 = json_array_get_length(array);
-                               LOGI("range dim: size[%u]", elements2);
-                               for (unsigned int elem2 = 0; elem2 < elements2; ++elem2) {
-                                       if (static_cast<int>(json_array_get_int_element(array, elem2)) == 1)
-                                               dimInfo.SetValidIndex(elem2);
-                               }
-
-                               try {
-                                       type = GetSupportedType(pObject, "landmark_type", supportedLandmarkTypes);
-                                       coordinate = GetSupportedType(pObject, "landmark_coordinate", supportedLandmarkCoordinateTypes);
-                                       decodingType = GetSupportedType(pObject, "decoding_type", supportedLandmarkDecodingTypes);
-                               } catch (const std::exception& e) {
-                                       LOGE("Invalid %s", e.what());
-                                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
-                               }
-
-                               offset = static_cast<int>(json_object_get_int_member(pObject, "landmark_offset"));
-                               LOGI("landmark offset: %d", offset);
+                       try {
+                               type = GetSupportedType(root, "landmark_type", supportedLandmarkTypes);
+                               coordinate = GetSupportedType(root, "landmark_coordinate", supportedLandmarkCoordinateTypes);
+                               decodingType = GetSupportedType(root, "decoding_type", supportedLandmarkDecodingTypes);
+                       } catch (const std::exception& e) {
+                               LOGE("Invalid %s", e.what());
+                               return MEDIA_VISION_ERROR_INVALID_OPERATION;
                        }
 
+                       offset = static_cast<int>(json_object_get_int_member(root, "landmark_offset"));
+                       LOGI("landmark offset: %d", offset);
+
                        LOGI("LEAVE");
                        return MEDIA_VISION_ERROR_NONE;
                }
@@ -195,54 +186,36 @@ namespace inference
                {
                        LOGI("ENTER");
 
-                       // box
-                       JsonArray * rootArray = json_object_get_array_member(root, "landmark");
-                       unsigned int elements = json_array_get_length(rootArray);
+                       JsonObject *cObject = json_object_get_object_member(root, "decoding_info");
+                       if (!json_object_has_member(cObject, "heatmap")) {
+                               LOGE("heatmap is mandatory. Invalid metadata");
+                               LOGI("LEAVE");
 
-                       // TODO: handling error
-                       for (unsigned int elem = 0; elem < elements; ++elem) {
+                               return MEDIA_VISION_ERROR_INVALID_OPERATION;
+                       }
 
-                               JsonNode *pNode = json_array_get_element(rootArray, elem);
-                               JsonObject *pObject = json_node_get_object(pNode);
+                       JsonObject *object = json_object_get_object_member(cObject, "heatmap") ;
+                       try {
+                               GetHeatMapInfo().shapeType = GetSupportedType(object, "shape_type", supportedShapeType);
+                       } catch (const std::exception& e) {
+                               LOGE("Invalid %s", e.what());
+                               return MEDIA_VISION_ERROR_INVALID_OPERATION;
+                       }
+
+                       std::vector<int> heatMapIndexes = GetDimInfo().GetValidIndexAll();
+                       if (GetHeatMapInfo().shapeType == INFERENCE_TENSOR_SHAPE_NCHW) {
+                               GetHeatMapInfo().cIdx = heatMapIndexes[0];
+                               GetHeatMapInfo().hIdx = heatMapIndexes[1];
+                               GetHeatMapInfo().wIdx = heatMapIndexes[2];
+                       } else {
+                               GetHeatMapInfo().hIdx = heatMapIndexes[0];
+                               GetHeatMapInfo().wIdx = heatMapIndexes[1];
+                               GetHeatMapInfo().cIdx = heatMapIndexes[2];
+                       }
 
-                               if (!json_object_has_member(pObject, "decoding_info")) {
-                                       LOGE("decoding_info is mandatory. Invalid metadata");
-                                       LOGI("LEAVE");
-
-                                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
-                               }
-
-                               JsonObject *cObject = json_object_get_object_member(pObject, "decoding_info");
-                               if (!json_object_has_member(cObject, "heatmap")) {
-                                       LOGE("heatmap is mandatory. Invalid metadata");
-                                       LOGI("LEAVE");
-
-                                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
-                               }
-
-                               JsonObject *object = json_object_get_object_member(cObject, "heatmap") ;
-                               try {
-                                       GetHeatMapInfo().shapeType = GetSupportedType(object, "shape_type", supportedShapeType);
-                               } catch (const std::exception& e) {
-                                       LOGE("Invalid %s", e.what());
-                                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
-                               }
-
-                               std::vector<int> heatMapIndexes = GetDimInfo().GetValidIndexAll();
-                               if (GetHeatMapInfo().shapeType == INFERENCE_TENSOR_SHAPE_NCHW) {
-                                       GetHeatMapInfo().cIdx = heatMapIndexes[0];
-                                       GetHeatMapInfo().hIdx = heatMapIndexes[1];
-                                       GetHeatMapInfo().wIdx = heatMapIndexes[2];
-                               } else {
-                                       GetHeatMapInfo().hIdx = heatMapIndexes[0];
-                                       GetHeatMapInfo().wIdx = heatMapIndexes[1];
-                                       GetHeatMapInfo().cIdx = heatMapIndexes[2];
-                               }
-
-                               if (json_object_has_member(object, "nms_radius")) {
-                                       GetHeatMapInfo().nmsRadius = static_cast<float>(json_object_get_double_member(object, "nms_radius"));
-                                       LOGI("nms is enabled with %3.f", GetHeatMapInfo().nmsRadius );
-                               }
+                       if (json_object_has_member(object, "nms_radius")) {
+                               GetHeatMapInfo().nmsRadius = static_cast<float>(json_object_get_double_member(object, "nms_radius"));
+                               LOGI("nms is enabled with %3.f", GetHeatMapInfo().nmsRadius );
                        }
 
                        LOGI("LEAVE");
index 9385aa7bf0fa23e04e20ec97989d834f5392a94b..402b7e21d7b56a74af40fc6bb7dd25d31a4c3243 100644 (file)
@@ -52,7 +52,7 @@ namespace inference
                bool parsed;
                ScoreInfo score;
                box::BoxInfo box;
-               Landmark landmark;
+               std::vector<Landmark> landmarks;
                OffsetVec offsetVec;
                std::map<std::string, inference_tensor_shape_type_e> mSupportedShapeType;
 
@@ -103,15 +103,16 @@ namespace inference
                DimInfo GetBoxNumberDimInfo() { return box.GetNumberDimInfo(); }
 
                int GetScoreCoordinate() { return box.GetCoordinate(); }
-               std::string GetLandmarkName() { return landmark.GetName(); }
-               int GetLandmarkOffset() { return landmark.GetOffset(); }
-               inference_landmark_type_e GetLandmarkType() { return landmark.GetType(); }
-               DimInfo GetLandmarkDimInfo() { return landmark.GetDimInfo(); }
-               HeatMapInfo& GetLandmarkHeatMapInfo() { return landmark.GetHeatMapInfo(); }
-               inference_landmark_coorindate_type_e GetLandmarkCoordinate() { return landmark.GetCoordinate(); }
-               inference_landmark_decoding_type_e GetLandmarkDecodingType() { return landmark.GetDecodingType(); }
-               std::vector<DispVec>& GetLandmarkDispVecAll() { return landmark.GetDispVecAll(); }
-               std::vector<std::pair<int, int>>& GetLandmarkEdges() { return landmark.GetEdges(); }
+               size_t GetLandmarksSize() { return landmarks.size(); }
+               std::string GetLandmarkName(int idx = 0) { return landmarks[idx].GetName(); }
+               int GetLandmarkOffset(int idx = 0) { return landmarks[idx].GetOffset(); }
+               inference_landmark_type_e GetLandmarkType(int idx = 0) { return landmarks[idx].GetType(); }
+               DimInfo GetLandmarkDimInfo(int idx = 0) { return landmarks[idx].GetDimInfo(); }
+               HeatMapInfo& GetLandmarkHeatMapInfo(int idx = 0) { return landmarks[idx].GetHeatMapInfo(); }
+               inference_landmark_coorindate_type_e GetLandmarkCoordinate(int idx = 0) { return landmarks[idx].GetCoordinate(); }
+               inference_landmark_decoding_type_e GetLandmarkDecodingType(int idx = 0) { return landmarks[idx].GetDecodingType(); }
+               std::vector<DispVec>& GetLandmarkDispVecAll(int idx = 0) { return landmarks[idx].GetDispVecAll(); }
+               std::vector<std::pair<int, int>>& GetLandmarkEdges(int idx = 0) { return landmarks[idx].GetEdges(); }
                std::string GetOffsetVecName() { return offsetVec.GetName(); }
                inference_box_decoding_type_e GetBoxDecodingType() { return box.GetDecodingType(); }
        };
index aaeb48ea1fbf5f7c23f6eb97ec6008f4bb688887..c1ea49324fb45aa04816ddef79bfc804e2d492c0 100644 (file)
@@ -45,6 +45,7 @@ namespace inference
                int mHeatMapHeight;
                int mHeatMapChannel;
                int mNumberOfLandmarks;
+               int mIdx;
 
                std::list<LandmarkPoint> mCandidates;
                std::vector<LandmarkResults> mPoseLandmarks;
@@ -66,12 +67,13 @@ namespace inference
        public:
                PoseDecoder(TensorBuffer& buffer, const OutputMetadata& metaData,
                                        int heatMapWidth, int heatMapHeight, int heatMapChannel,
-                                       int numberOfLandmarks) :
+                                       int numberOfLandmarks, int idx = 0) :
                                        mTensorBuffer(buffer),
                                        mHeatMapWidth(heatMapWidth),
                                        mHeatMapHeight(heatMapHeight),
                                        mHeatMapChannel(heatMapChannel),
-                                       mNumberOfLandmarks(numberOfLandmarks) {
+                                       mNumberOfLandmarks(numberOfLandmarks),
+                                       mIdx(idx) {
                                                mMeta = metaData;
                                        };
 
index 2a43efdd77e46db7142aa8d99b716626303ee3c1..31d2a9c0c37c106d5c0680f4b7ac323abc1e9d1e 100755 (executable)
@@ -463,15 +463,14 @@ namespace inference
                        if (!outputMeta.GetBoxNumberName().empty())
                                mConfig.mOutputLayerNames.push_back(outputMeta.GetBoxNumberName());
 
-                       if (!outputMeta.GetLandmarkName().empty())
-                               mConfig.mOutputLayerNames.push_back(outputMeta.GetLandmarkName());
+                       for (int idx = 0; idx < outputMeta.GetLandmarksSize(); idx++) {
+                               mConfig.mOutputLayerNames.push_back(outputMeta.GetLandmarkName(idx));
+                               for (auto& dispVec : outputMeta.GetLandmarkDispVecAll(idx))
+                                       mConfig.mOutputLayerNames.push_back(dispVec.GetName());
+                       }
 
                        if (!outputMeta.GetOffsetVecName().empty())
                                mConfig.mOutputLayerNames.push_back(outputMeta.GetOffsetVecName());
-
-                       for (auto& dispVec : outputMeta.GetLandmarkDispVecAll()) {
-                               mConfig.mOutputLayerNames.push_back(dispVec.GetName());
-                       }
                }
 
                inference_engine_layer_property property;
@@ -1532,56 +1531,59 @@ namespace inference
                                return MEDIA_VISION_ERROR_INVALID_OPERATION;
                        }
 
-                       int heatMapWidth = 0;
-                       int heatMapHeight = 0;
-                       int heatMapChannel = 0;
-                       std::vector<int> channelIndexes = outputMeta.GetLandmarkDimInfo().GetValidIndexAll();
-                       int number_of_landmarks = heatMapChannel;
-
-                       if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
-                               LOGI("landmark dim size: %zd and idx[0] is %d", channelIndexes.size(), channelIndexes[0]);
-                               number_of_landmarks = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]]
-                                                                       / outputMeta.GetLandmarkOffset();
-                       } else if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) {
-                               number_of_landmarks = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]];
-                       } else {
-                               heatMapWidth = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[outputMeta.GetLandmarkHeatMapInfo().wIdx];
-                               heatMapHeight = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[outputMeta.GetLandmarkHeatMapInfo().hIdx];
-                               heatMapChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[outputMeta.GetLandmarkHeatMapInfo().cIdx];
-                       }
+                       for (int idx = 0; idx < outputMeta.GetLandmarksSize(); idx++) {
+                               LOGE("decode: %s", outputMeta.GetLandmarkName(idx).c_str());
+                               int heatMapWidth = 0;
+                               int heatMapHeight = 0;
+                               int heatMapChannel = 0;
+                               std::vector<int> channelIndexes = outputMeta.GetLandmarkDimInfo(idx).GetValidIndexAll();
+                               int number_of_landmarks = heatMapChannel;
+
+                               if (outputMeta.GetLandmarkDecodingType(idx) == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
+                                       LOGI("landmark dim size: %zd and idx[0] is %d", channelIndexes.size(), channelIndexes[0]);
+                                       number_of_landmarks = mOutputLayerProperty.layers[outputMeta.GetLandmarkName(idx)].shape[channelIndexes[0]]
+                                                                               / outputMeta.GetLandmarkOffset(idx);
+                               } else if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) {
+                                       number_of_landmarks = mOutputLayerProperty.layers[outputMeta.GetLandmarkName(idx)].shape[channelIndexes[0]];
+                               } else {
+                                       heatMapWidth = mOutputLayerProperty.layers[outputMeta.GetLandmarkName(idx)].shape[outputMeta.GetLandmarkHeatMapInfo(idx).wIdx];
+                                       heatMapHeight = mOutputLayerProperty.layers[outputMeta.GetLandmarkName(idx)].shape[outputMeta.GetLandmarkHeatMapInfo(idx).hIdx];
+                                       heatMapChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName(idx)].shape[outputMeta.GetLandmarkHeatMapInfo(idx).cIdx];
+                               }
 
-                       LOGI("heatMap: w[%d], h[%d], c[%d]", heatMapWidth, heatMapHeight, heatMapChannel);
+                               LOGI("heatMap: w[%d], h[%d], c[%d]", heatMapWidth, heatMapHeight, heatMapChannel);
 
-                       // decoding
-                       PoseDecoder poseDecoder(mOutputTensorBuffers, outputMeta,
-                                                                       heatMapWidth, heatMapHeight, heatMapChannel,
-                                                                       number_of_landmarks);
+                               // decoding
+                               PoseDecoder poseDecoder(mOutputTensorBuffers, outputMeta,
+                                                                               heatMapWidth, heatMapHeight, heatMapChannel,
+                                                                               number_of_landmarks, idx);
 
-                       // initialize decorder queue with landmarks to be decoded.
-                       int ret = poseDecoder.init();
-                       if (ret != MEDIA_VISION_ERROR_NONE) {
-                               LOGE("Fail to init poseDecoder");
-                               return ret;
-                       }
+                               // initialize decorder queue with landmarks to be decoded.
+                               int ret = poseDecoder.init();
+                               if (ret != MEDIA_VISION_ERROR_NONE) {
+                                       LOGE("Fail to init poseDecoder");
+                                       return ret;
+                               }
 
-                       float inputW = 1.f;
-                       float inputH = 1.f;
+                               float inputW = 1.f;
+                               float inputH = 1.f;
 
-                       if (outputMeta.GetLandmarkCoordinate() == INFERENCE_LANDMARK_COORDINATE_TYPE_PIXEL) {
-                               inputW = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.GetWidth());
-                               inputH = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.GetHeight());
-                       }
+                               if (outputMeta.GetLandmarkCoordinate(idx) == INFERENCE_LANDMARK_COORDINATE_TYPE_PIXEL) {
+                                       inputW = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.GetWidth());
+                                       inputH = static_cast<float>(mMetadata.GetInputMeta().GetLayer().begin()->second.GetHeight());
+                               }
 
-                       float thresRadius = outputMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ? 0.0 :
-                                                                                                               outputMeta.GetLandmarkHeatMapInfo().nmsRadius;
+                               float thresRadius = outputMeta.GetLandmarkType(idx) == INFERENCE_LANDMARK_TYPE_2D_SINGLE ? 0.0 :
+                                                                                                                       outputMeta.GetLandmarkHeatMapInfo(idx).nmsRadius;
 
-                       poseDecoder.decode(inputW, inputH, thresRadius);
-                       LOGE("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height);
-                       for (int landmarkIndex = 0; landmarkIndex < number_of_landmarks; landmarkIndex++) {
-                               results->locations.push_back(
-                                       cv::Point3f(poseDecoder.getPointX(0, landmarkIndex) * static_cast<float>(mSourceSize.width),
-                                                         poseDecoder.getPointY(0, landmarkIndex) * static_cast<float>(mSourceSize.height),
-                                                         poseDecoder.getPointZ(0, landmarkIndex)));
+                               poseDecoder.decode(inputW, inputH, thresRadius);
+                               LOGE("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height);
+                               for (int landmarkIndex = 0; landmarkIndex < number_of_landmarks; landmarkIndex++) {
+                                       results->locations.push_back(
+                                               cv::Point3f(poseDecoder.getPointX(0, landmarkIndex) * static_cast<float>(mSourceSize.width),
+                                                               poseDecoder.getPointY(0, landmarkIndex) * static_cast<float>(mSourceSize.height),
+                                                               poseDecoder.getPointZ(0, landmarkIndex)));
+                               }
                        }
 
                        results->number_of_landmarks = results->locations.size();
index 391b265cb250c3b0fe4cc4d61307bd3f3fa0fb95..dd0c1219758d4eda836a0bd40f4ad99e833eba2b 100755 (executable)
@@ -35,7 +35,7 @@ namespace inference
                        parsed(false),
                        score(),
                        box(),
-                       landmark(),
+                       landmarks(),
                        offsetVec()
        {
                // shape_type
@@ -389,10 +389,49 @@ namespace inference
                        return MEDIA_VISION_ERROR_NONE;
                }
 
-               landmark.ParseLandmark(root);
+               int ret = MEDIA_VISION_ERROR_NONE;
+               JsonArray * rootArray = json_object_get_array_member(root, "landmark");
+               unsigned int elements = json_array_get_length(rootArray);
+               for (unsigned int elem = 0; elem < elements; ++elem) {
+                       JsonNode *pNode = json_array_get_element(rootArray, elem);
+                       JsonObject *pObject = json_node_get_object(pNode);
+                       Landmark lmark;
+                       lmark.ParseLandmark(pObject);
+
+                       if (lmark.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP ||
+                               lmark.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) {
+                               ret = lmark.ParseDecodeInfo(pObject, mSupportedShapeType);
+                               if (ret != MEDIA_VISION_ERROR_NONE) {
+                                       LOGE("Fail to GetLandmarkDecodeInfo[%d]", ret);
+                                       return ret;
+                               }
+                       }
+
+                       if (lmark.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) {
+                               ret = ParseOffset(root);
+                               if (ret != MEDIA_VISION_ERROR_NONE) {
+                                       LOGE("Fail to GetOffsetVector[%d]", ret);
+                                       return ret;
+                               }
+
+                               ret = lmark.ParseDisplacement(root, mSupportedShapeType);
+                               if (ret != MEDIA_VISION_ERROR_NONE) {
+                                       LOGE("Fail to GetDispVector[%d]", ret);
+                                       return ret;
+                               }
+
+                               ret = lmark.ParseEdgeMap(root);
+                               if (ret != MEDIA_VISION_ERROR_NONE) {
+                                       LOGE("Fail to GetEdgeConnection[%d]", ret);
+                                       return ret;
+                               }
+                       }
+
+                       landmarks.push_back(lmark);
+               }
 
                LOGI("LEAVE");
-               return MEDIA_VISION_ERROR_NONE;
+               return ret;
        }
 
        int OutputMetadata::ParseOffset(JsonObject *root)
@@ -467,37 +506,6 @@ namespace inference
                        return ret;
                }
 
-               if (!landmark.GetName().empty()) {
-                       if (landmark.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP ||
-                               landmark.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) {
-                               ret = landmark.ParseDecodeInfo(root, mSupportedShapeType);
-                               if (ret != MEDIA_VISION_ERROR_NONE) {
-                                       LOGE("Fail to GetLandmarkDecodeInfo[%d]", ret);
-                                       return ret;
-                               }
-                       }
-
-                       if (landmark.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) {
-                               ret = ParseOffset(root);
-                               if (ret != MEDIA_VISION_ERROR_NONE) {
-                                       LOGE("Fail to GetOffsetVector[%d]", ret);
-                                       return ret;
-                               }
-
-                               ret = landmark.ParseDisplacement(root, mSupportedShapeType);
-                               if (ret != MEDIA_VISION_ERROR_NONE) {
-                                       LOGE("Fail to GetDispVector[%d]", ret);
-                                       return ret;
-                               }
-
-                               ret = landmark.ParseEdgeMap(root);
-                               if (ret != MEDIA_VISION_ERROR_NONE) {
-                                       LOGE("Fail to GetEdgeConnection[%d]", ret);
-                                       return ret;
-                               }
-                       }
-               }
-
                parsed = true;
 
                LOGI("LEAVE");
index 0cadd8a7120393fc02cd9b19b80aa9502c2fbe0c..bc124d064e2133daa7ad4907ee227bad8ef7e681 100644 (file)
@@ -50,8 +50,8 @@ namespace inference
        {
                LOGI("ENTER");
 
-               if (mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS ||
-                       mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) {
+               if (mMeta.GetLandmarkDecodingType(mIdx) == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS ||
+                       mMeta.GetLandmarkDecodingType(mIdx) == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) {
                        LOGI("Skip init");
                        return MEDIA_VISION_ERROR_NONE;
                }
@@ -64,8 +64,8 @@ namespace inference
 
                mCandidates.clear();
 
-               if (mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
-                       mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_3D_SINGLE) {
+               if (mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
+                       mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_3D_SINGLE) {
                        mCandidates.resize(mHeatMapChannel);
                }
 
@@ -83,8 +83,8 @@ namespace inference
                                        if (score < mMeta.GetScoreThreshold())
                                                continue;
 
-                                       if (mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
-                                               mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_3D_SINGLE) {
+                                       if (mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
+                                               mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_3D_SINGLE) {
                                                if (score <= candidate->score)
                                                        continue;
 
@@ -119,7 +119,7 @@ namespace inference
                                                        continue;
 
                                                // add this to list
-                                               LOGI("[%d x %d][%d]: score %.3f", y, x, c, score);
+                                               //LOGI("[%d x %d][%d]: score %.3f", y, x, c, score);
                                                std::list<LandmarkPoint>::iterator iter;
                                                for (iter = mCandidates.begin(); iter != mCandidates.end(); ++iter) {
                                                        if ((*iter).score < score) {
@@ -239,27 +239,27 @@ namespace inference
 
                LandmarkPoint initValue = {0.0f, cv::Point(0,0), cv::Point3f(0.0f, 0.0f, 0.0f), -1, false};
 
-               if (mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
-                       mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_3D_SINGLE) {
+               if (mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
+                       mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_3D_SINGLE) {
                        mPoseLandmarks.resize(1);
 
-                       if (mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS ||
-                               mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) {
+                       if (mMeta.GetLandmarkDecodingType(mIdx) == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS ||
+                               mMeta.GetLandmarkDecodingType(mIdx) == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) {
                                mPoseLandmarks[0].landmarks.resize(mNumberOfLandmarks);
                        } else {
                                mPoseLandmarks[0].landmarks.resize(mHeatMapChannel);
                        }
                }
 
-               if (mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP ||
-                       mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) {
+               if (mMeta.GetLandmarkDecodingType(mIdx) == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP ||
+                       mMeta.GetLandmarkDecodingType(mIdx) == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) {
                        while (!mCandidates.empty()) {
 
                                LandmarkPoint &root = mCandidates.front();
 
                                getIndexToPos(root, scaleWidth, scaleHeight);
 
-                               if (mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE) {
+                               if (mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_2D_SINGLE) {
                                        root.valid = true;
                                        mPoseLandmarks[0].landmarks[root.id] = root;
                                        mPoseLandmarks[0].score += root.score;
@@ -311,11 +311,11 @@ namespace inference
                        for (auto& pose : mPoseLandmarks) {
                                pose.score /= static_cast<float>(mHeatMapChannel);
                        }
-               } else if (mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) {
-                       int landmarkOffset = mMeta.GetLandmarkOffset();
+               } else if (mMeta.GetLandmarkDecodingType(mIdx) == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) {
+                       int landmarkOffset = mMeta.GetLandmarkOffset(mIdx);
                        for (int idx = 0; idx < mNumberOfLandmarks; ++idx) {
-                                       float py = mTensorBuffer.getValue<float>(mMeta.GetLandmarkName(), idx * landmarkOffset);
-                                       float px = mTensorBuffer.getValue<float>(mMeta.GetLandmarkName(), idx * landmarkOffset + 1);
+                                       float py = mTensorBuffer.getValue<float>(mMeta.GetLandmarkName(mIdx), idx * landmarkOffset);
+                                       float px = mTensorBuffer.getValue<float>(mMeta.GetLandmarkName(mIdx), idx * landmarkOffset + 1);
                                        float pscore = mTensorBuffer.getValue<float>(mMeta.GetScoreName(), idx * landmarkOffset + 2);
 
                                        mPoseLandmarks[0].landmarks[idx].score = pscore;
@@ -351,19 +351,23 @@ namespace inference
                                }
                        }
 
-                       int landmarkOffset = (mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
-                                                                 mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_MULTI) ? 2 : 3;
+                       int landmarkOffset = (mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
+                                                                 mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_2D_MULTI) ? 2 : 3;
                        if (mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
-                               landmarkOffset = mMeta.GetLandmarkOffset();
+                               landmarkOffset = mMeta.GetLandmarkOffset(mIdx);
                        }
 
                        LOGE("landmark count : %d", mNumberOfLandmarks);
                        LOGE("landmark offset: %d", landmarkOffset);
                        LOGE("scale width x height: %.3fx%.3f", scaleWidth, scaleHeight);
                        for (int idx = 0; idx < mNumberOfLandmarks; ++idx) {
-                                       float px = mTensorBuffer.getValue<float>(mMeta.GetLandmarkName(), idx * landmarkOffset);
-                                       float py = landmarkOffset >= 2 ? mTensorBuffer.getValue<float>(mMeta.GetLandmarkName(), idx * landmarkOffset + 1) : 0.0f;
-                                       float pz = landmarkOffset >= 3 ? mTensorBuffer.getValue<float>(mMeta.GetLandmarkName(), idx * landmarkOffset + 2) : 0.0f;
+                                       float px = mTensorBuffer.getValue<float>(mMeta.GetLandmarkName(mIdx), idx * landmarkOffset);
+                                       float py = landmarkOffset >= 2 ?
+                                                                               mTensorBuffer.getValue<float>(mMeta.GetLandmarkName(mIdx), idx * landmarkOffset + 1) :
+                                                                               0.0f;
+                                       float pz = landmarkOffset >= 3 ?
+                                                                               mTensorBuffer.getValue<float>(mMeta.GetLandmarkName(mIdx), idx * landmarkOffset + 2) :
+                                                                               0.0f;
 
                                        mPoseLandmarks[0].landmarks[idx].score = landmarkOffset < 5 ? poseScore : mTensorBuffer.getValue<float>(mMeta.GetLandmarkName(), idx * landmarkOffset + 4);
                                        mPoseLandmarks[0].landmarks[idx].heatMapLoc = cv::Point(-1, -1);
index f8086a0c70a6e85069249287131c318777f024d2..51394cf54826e8bd24bcf8f8aac81f10b8a44861 100644 (file)
 #define MAX_STRING_LENGTH 1024
 #define ARRAY_SIZE(x) (sizeof((x)) / sizeof((x)[0]))
 #define MAX_FRAMES 1800 // 30 fps * 60s 
-#define FLD_MODEL_PATH "/usr/share/capi-media-vision/models/FLD/tflite/fld_mediapipe_192x192.tflite"
-#define FLD_META_PATH "/usr/share/capi-media-vision/models/FLD/tflite/fld_mediapipe_192x192.json"
+//#define FLD_MODEL_PATH "/usr/share/capi-media-vision/models/FLD/tflite/fld_mediapipe_192x192.tflite"
+//#define FLD_META_PATH "/usr/share/capi-media-vision/models/FLD/tflite/fld_mediapipe_192x192.json"
+#define FLD_MODEL_PATH "/usr/share/capi-media-vision/models/FLD/tflite/face_landmark_with_attention.tflite"
+#define FLD_META_PATH "/usr/share/capi-media-vision/models/FLD/tflite/face_landmark_with_attention.json"
 #define FD_MODEL_PATH "/usr/share/capi-media-vision/models/FD/tflite/fd_blazeface_front_128x128.tflite"
 #define FD_META_PATH "/usr/share/capi-media-vision/models/FD/tflite/fd_blazeface_front_128x128.json"
 #define FD_LABEL_PATH "/usr/share/capi-media-vision/models/FD/tflite/fd_blazeface_front_label.txt"
@@ -98,6 +100,7 @@ typedef struct _appdata {
        int flandmark_num;
        int numFrame;
        CairoOverlayState *overlay_state;
+       int input;
 } Appdata;
 
 
@@ -122,7 +125,7 @@ GstElement *pipeline;
 
 // Gstreamer - camera src
 GstElement *facecam, *source, *filter, *vconv, *tee;
-GstElement *sdec, *sscale;
+GstElement *sdec, *sscale, *srate;
 GstElement *queue1, *queue2, *queue3;
 GstElement *vscale, *vsfilter;
 
@@ -198,7 +201,7 @@ static gboolean bus_call (GstBus *bus, GstMessage *msg, gpointer data)
 
 static void cairo_overlay_handler(GstElement *overlay, cairo_t *cr, guint64 timestamp, guint64 duration, gpointer user_data)
 {
-       printf("cairo_overlay_handler\n");
+       //printf("cairo_overlay_handler\n");
        Appdata *appdata = static_cast<Appdata *>(user_data);
        CairoOverlayState *s = (CairoOverlayState *)appdata->overlay_state;
        if (!s->valid)
@@ -212,12 +215,10 @@ static void cairo_overlay_handler(GstElement *overlay, cairo_t *cr, guint64 time
                        cairo_arc(cr, faceSkeleton.fLmark[pt].x, faceSkeleton.fLmark[pt].y, LD_RADIUS, LD_START_ANGLE, LD_END_ANGLE);
                        cairo_fill(cr);
                }
-               /*
                cairo_rectangle(cr, faceSkeleton.fRoi.point.x,
                                                        faceSkeleton.fRoi.point.y,
                                                        faceSkeleton.fRoi.width,
                                                        faceSkeleton.fRoi.height);
-               */
        }
 
        cairo_stroke(cr);
@@ -251,16 +252,32 @@ static void _facial_landmark_cb(mv_source_h source,
        float smoothingCoeff = 0.2f;
        float maxAlpha = 0.8f;
 
+       unsigned int width, height, bufferSize;
+       unsigned char *buffer = nullptr;
+       mv_source_get_width(source, &width);
+       mv_source_get_height(source, &height);
+       mv_source_get_buffer(source, &buffer, &bufferSize);
+
+       cv::Mat result(cv::Size(width, height), CV_8UC3, buffer);
        for (int pt=0; pt < landmarks; pt++) {
                x = static_cast<float>(locations[pt].x) / 192.f * static_cast<float>(faceSkeleton.fRoi.width);
                y = static_cast<float>(locations[pt].y) / 192.f * static_cast<float>(faceSkeleton.fRoi.height);
                faceSkeleton.fLmark[pt].x = static_cast<int>(x) + faceSkeleton.fRoi.point.x;
                faceSkeleton.fLmark[pt].y = static_cast<int>(y) + faceSkeleton.fRoi.point.y;
                faceSkeleton.fLmark[pt].z = locations[pt].z;
+
+               // 0 ~ 79: lips
+               // 80 ~ 150: left eye
+               // 151 ~ 221: right eye
+               cv::circle(result, cv::Point(locations[pt].x, locations[pt].y), 1, pt < 222 ? cv::Scalar(0,255,0) : cv::Scalar(255,0,0));
+
+               /*
                printf("%d: x[%d], y[%d], z[%f]\n", pt, faceSkeleton.fLmark[pt].x,
                                                                                                faceSkeleton.fLmark[pt].y,
                                                                                                faceSkeleton.fLmark[pt].z);
+               */
        }
+       cv::imwrite("/tmp/result.png", result);
 }
 
 static gboolean
@@ -375,9 +392,14 @@ static void fd_handoff(GstElement *object, GstBuffer *buffer, GstPad *pad, gpoin
 }
 int createPipelineCam(Appdata& appdata)
 {
-       source = gst_element_factory_make("v4l2src", "src");
+       if (appdata.input == 0) {
+               source = gst_element_factory_make("v4l2src", "src");
+       } else {
+               source = gst_element_factory_make("multifilesrc", "src");
+       }
        sdec = gst_element_factory_make("jpegdec", "sdec");
        sscale = gst_element_factory_make("videoscale", "sscale");
+       srate = gst_element_factory_make("videorate", "srate");
        filter = gst_element_factory_make("capsfilter", "filter");
 
        
@@ -391,7 +413,7 @@ int createPipelineCam(Appdata& appdata)
        vsfilter = gst_element_factory_make("capsfilter", "vsfilter");
        vconv = gst_element_factory_make("videoconvert", "convert");
        vcfilter = gst_element_factory_make("capsfilter", "vcfilter");
-       vrate = gst_element_factory_make("videorate", "rate");
+       vrate = gst_element_factory_make("videorate", "vrate");
        vrfilter = gst_element_factory_make("capsfilter", "vrfilter");
        vrsink = gst_element_factory_make("fakesink", "vrsink");
 
@@ -409,7 +431,7 @@ int createPipelineCam(Appdata& appdata)
        vcrscfilter = gst_element_factory_make("capsfilter", "vcrscfilter");
        vcrssink = gst_element_factory_make("fakesink", "vcrssink");
 
-       if (!facecam || !source || !filter || !sdec || !sscale ||
+       if (!facecam || !source || !filter || !sdec || !sscale || !srate ||
                !tee || !queue1 || !vscale || !vsfilter || !vconv || !vcfilter ||
                !vrate || !vrfilter || !vrsink ||
                !queue2 || !oconv || !coverlay || !sink || !sink2 ||
@@ -421,7 +443,13 @@ int createPipelineCam(Appdata& appdata)
        g_signal_connect(coverlay, "draw", G_CALLBACK(cairo_overlay_handler), &appdata);
        g_signal_connect(coverlay, "caps-changed", G_CALLBACK (prepare_overlay), &appdata);
 
-       g_object_set(G_OBJECT(source), "device", "/dev/video0", NULL);
+       if (appdata.input == 0) {
+               g_object_set(G_OBJECT(source), "device", "/dev/video2", NULL);
+       } else {
+               g_object_set(G_OBJECT(source), "location", "/tmp/sample.jpg", NULL);
+               g_object_set(G_OBJECT(source), "loop", TRUE, NULL);
+       }
+
        g_object_set(G_OBJECT(sink2), "use-tbm", FALSE, NULL);
        g_object_set(G_OBJECT(sink2), "sync", FALSE, NULL);
        g_object_set(G_OBJECT(sink), "video-sink", sink2, NULL);
@@ -453,14 +481,17 @@ int createPipelineCam(Appdata& appdata)
 
 
        gst_bin_add_many(GST_BIN(facecam),
-                                       source, sdec, sscale, filter,
+                                       source, sdec, sscale, srate, filter,
                                        tee, queue1, vscale, vsfilter, vconv, vcfilter,
                                        vrate, vrfilter, vrsink,
                                        queue2, oconv, coverlay, sink,
                                        queue3, vcrop, vcrscale, vcrsfilter, vcrsconv, vcrscfilter, vcrssink, NULL);
 
        /* link elements */
-       gst_element_link_many(source, sdec, sscale, filter, tee, NULL);
+       if (appdata.input == 0 )
+               gst_element_link_many(source, /*sdec,*/ sscale, filter, tee, NULL);
+       else
+               gst_element_link_many(source, sdec, sscale, srate, filter, tee, NULL);
        // pose
        gst_element_link_many (tee, queue3, vcrop, vcrscale, vcrsfilter, vcrsconv, vcrscfilter, vcrssink, NULL);
        // display
@@ -490,6 +521,11 @@ int main(int argc, char *argv[])
        appdata.numFrame = 0;
        appdata.flandmark_num = 0;
        appdata.overlay_state = g_new0(CairoOverlayState, 1);
+       if (argc == 2)
+               appdata.input = atoi(argv[1]); // 0: gst camera, 1: gst image file
+       else
+               appdata.input = 0;
+
        int ret = MEDIA_VISION_ERROR_NONE;
        printf("enter main\n");