Change Landmark's landmark_type, landmark_coordinate, and decoding_type to string 05/263305/2
author Tae-Young Chung <ty83.chung@samsung.com>
Tue, 31 Aug 2021 08:09:49 +0000 (17:09 +0900)
committer Tae-Young Chung <ty83.chung@samsung.com>
Wed, 1 Sep 2021 03:32:21 +0000 (12:32 +0900)
A user can now set the landmark_type, landmark_coordinate, and decoding_type
metadata with descriptive strings that make the purpose of each value clear.
The string values are parsed and converted to the enumeration types
inference_landmark_type_e, inference_landmark_coordinate_type_e,
and inference_landmark_decoding_type_e, respectively.

Change-Id: Ia3a4098213a712ffd1927838d7b9931d1edfcbd0
Signed-off-by: Tae-Young Chung <ty83.chung@samsung.com>
meta-template/fld_mediapipe_192x192.json
meta-template/fld_tweakcnn_128x128.json
meta-template/pld_cpm_192x192.json
meta-template/pld_mobilenet_v1_posenet_multi_257x257.json
mv_machine_learning/mv_inference/inference/include/OutputMetadata.h
mv_machine_learning/mv_inference/inference/include/OutputMetadataTypes.h
mv_machine_learning/mv_inference/inference/src/Inference.cpp
mv_machine_learning/mv_inference/inference/src/OutputMetadata.cpp
mv_machine_learning/mv_inference/inference/src/PoseDecoder.cpp
packaging/capi-media-vision.spec

index 78e3b15..ee7f63c 100644 (file)
@@ -36,9 +36,9 @@
             {
                 "name" : "conv2d_20",
                 "index" : [-1, -1, -1, 1],
-                "landmark_type" : 0,
-                "landmark_coordinate" : 1,
-                "decoding_type" : 0,
+                "landmark_type" : "2D_SINGLE",
+                "landmark_coordinate" : "PIXEL",
+                "decoding_type" : "BYPASS",
                 "landmark_offset" : 3
             }
         ]
index eaeeecd..0f2148c 100644 (file)
             {
                 "name" : "fanet8ss_inference/fully_connected_1/Sigmoid",
                 "index" : [-1, 1],
-                "landmark_type" : 0,
-                "landmark_coordinate" : 0,
+                "landmark_type" : "2D_SINGLE",
+                "landmark_coordinate" : "RATIO",
                 "landmark_offset" : 2,
-                "decoding_type" : 0
+                "decoding_type" : "BYPASS"
             }
         ]
     }
index aa4ed69..a0bb6e6 100644 (file)
@@ -36,9 +36,9 @@
             {
                 "name" : "Convolutional_Pose_Machine/stage_5_out",
                 "index" : [-1, 1, 1, 1],
-                "landmark_type" : 0,
-                "landmark_coordinate" : 1,
-                "decoding_type" : 1,
+                "landmark_type" : "2D_SINGLE",
+                "landmark_coordinate" : "PIXEL",
+                "decoding_type" : "HEATMAP",
                 "decoding_info" :
                 {
                     "heatmap" :
index 671e57c..2ef057a 100644 (file)
@@ -36,9 +36,9 @@
             {
                 "name" : "MobilenetV1/heatmap_2/BiasAdd",
                 "index" : [-1, 1, 1, 1],
-                "landmark_type" : 1,
-                "landmark_coordinate" : 1,
-                "decoding_type" : 2,
+                "landmark_type" : "2D_MULTI",
+                "landmark_coordinate" : "PIXEL",
+                "decoding_type" : "HEATMAP_REFINE",
                 "decoding_info" :
                 {
                     "heatmap" :
index 07488ec..fe917ad 100644 (file)
@@ -287,29 +287,37 @@ namespace inference
                                int cIdx;
                                inference_tensor_shape_type_e shapeType;
                                float nmsRadius;
+                               HeatMapInfo() = default;
+                               ~HeatMapInfo() = default;
                        };
                        HeatMapInfo heatMap;
+                       DecodeInfo() = default;
+                       ~DecodeInfo() = default;
                };
        private:
                std::string name;
                DimInfo dimInfo;
-               int type; /**< 0: 2d-single, 1: 2d-multi, 2: 3-single */
+               inference_landmark_type_e type; /**< 0: 2D_SINGLE, 1: 2D_MULTI, 2: 3D_SINGLE */
                int offset;
-               int coordinate; /**< 0: ratio, 1: pixel */
-               int decodingType; /**< 0: decoding  unnecessary,
-                                                       1: decoding heatmap,
-                                                       2: decoding heatmap with additional refine data */
+               inference_landmark_coorindate_type_e coordinate; /**< 0: RATIO, 1: PIXEL */
+               inference_landmark_decoding_type_e decodingType; /**< 0: decoding  unnecessary,
+                                                                                                                       1: decoding heatmap,
+                                                                                                                       2: decoding heatmap with refinement */
                DecodeInfo decodingInfo;
 
+               std::map<std::string, inference_landmark_type_e> supportedLandmarkTypes;
+               std::map<std::string, inference_landmark_coorindate_type_e> supportedLandmarkCoordinateTypes;
+               std::map<std::string, inference_landmark_decoding_type_e> supportedLandmarkDecodingTypes;
+
        public:
-               Landmark() = default;
+               Landmark();
                ~Landmark() = default;
                std::string GetName() { return name; }
                DimInfo GetDimInfo() { return dimInfo; }
-               int GetType();
+               inference_landmark_type_e GetType();
                int GetOffset();
-               int GetCoordinate();
-               int GetDecodingType();
+               inference_landmark_coorindate_type_e GetCoordinate();
+               inference_landmark_decoding_type_e GetDecodingType();
                DecodeInfo& GetDecodingInfo();
 
                int ParseLandmark(JsonObject *root);
index 523d0cb..0a0aadc 100644 (file)
@@ -52,6 +52,24 @@ namespace inference
                INFERENCE_BOX_NMS_TYPE_NONE = -1,
                INFERENCE_BOX_NMS_TYPE_STANDARD
        } inference_box_nms_type_e;
+
+       // landmark
+       typedef enum {
+               INFERENCE_LANDMARK_TYPE_2D_SINGLE,
+               INFERENCE_LANDMARK_TYPE_2D_MULTI,
+               INFERENCE_LANDMARK_TYPE_3D_SINGLE
+       } inference_landmark_type_e;
+
+       typedef enum {
+               INFERENCE_LANDMARK_COORDINATE_TYPE_RATIO,
+               INFERENCE_LANDMARK_COORDINATE_TYPE_PIXEL
+       } inference_landmark_coorindate_type_e;
+
+       typedef enum {
+               INFERENCE_LANDMARK_DECODING_TYPE_BYPASS,
+               INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP,
+               INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE
+       } inference_landmark_decoding_type_e;
 }
 }
 
index 3db0155..835bc6f 100755 (executable)
@@ -1638,7 +1638,7 @@ namespace inference
                        int heatMapWidth = 0;
                        int heatMapHeight = 0;
                        int heatMapChannel = 0;
-                       if (landmarkInfo.GetDecodingType() != 0) {
+                       if (landmarkInfo.GetDecodingType() != INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
                                heatMapWidth = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.wIdx];
                                heatMapHeight = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.hIdx];
                                heatMapChannel = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.cIdx];
@@ -1646,7 +1646,7 @@ namespace inference
 
                        int number_of_landmarks = 0;
                        std::vector<int> channelIndexes = landmarkInfo.GetDimInfo().GetValidIndexAll();
-                       if (landmarkInfo.GetDecodingType() == 0) {
+                       if (landmarkInfo.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
                                LOGI("landmark dim size: %zd and idx[0] is %d", channelIndexes.size(), channelIndexes[0]);
                                number_of_landmarks = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[channelIndexes[0]]
                                                                        / landmarkInfo.GetOffset();
@@ -1668,11 +1668,12 @@ namespace inference
 
                        float inputW = 1.f;
                        float inputH = 1.f;
-                       if (landmarkInfo.GetCoordinate() == 1) {
+                       if (landmarkInfo.GetCoordinate() == INFERENCE_LANDMARK_COORDINATE_TYPE_PIXEL) {
                                inputW = static_cast<float>(mMetadata.GetInputMeta().layer.begin()->second.GetWidth());
                                inputH = static_cast<float>(mMetadata.GetInputMeta().layer.begin()->second.GetHeight());
                        }
-                       float thresRadius = landmarkInfo.GetType() == 0 ? 0.0 : outputMeta.GetLandmark().GetDecodingInfo().heatMap.nmsRadius;
+                       float thresRadius = landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ? 0.0 :
+                                                                                                               outputMeta.GetLandmark().GetDecodingInfo().heatMap.nmsRadius;
                        poseDecoder.decode(inputW, inputH, thresRadius);
 
                        for (int landmarkIndex = 0; landmarkIndex < number_of_landmarks; landmarkIndex++) {
@@ -1730,7 +1731,7 @@ namespace inference
                        int heatMapWidth = 0;
                        int heatMapHeight = 0;
                        int heatMapChannel = 0;
-                       if (landmarkInfo.GetDecodingType() != 0) {
+                       if (landmarkInfo.GetDecodingType() != INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
                                heatMapWidth = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.wIdx];
                                heatMapHeight = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.hIdx];
                                heatMapChannel = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[landmarkInfo.GetDecodingInfo().heatMap.cIdx];
@@ -1745,14 +1746,15 @@ namespace inference
                                        return MEDIA_VISION_ERROR_INTERNAL;
                                }
                                // 2d+single or 2d+multi or 3d+single or 3d+multi
-                               int defaultNumberOfPose = (landmarkInfo.GetType() == 0 || landmarkInfo.GetType() == 2) ? 1 : MAX_NUMBER_OF_POSE;
+                               int defaultNumberOfPose = (landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
+                                                                                  landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_3D_SINGLE) ? 1 : MAX_NUMBER_OF_POSE;
                                std::vector<int> channelIndexes = landmarkInfo.GetDimInfo().GetValidIndexAll();
 
-                               // In case of DecodingType == 0,
+                               // If INFERENCE_LANDMARK_DECODING_TYPE_BYPASS,
                                // the landmarkChannel is guessed from the shape of the landmark output tensor.
-                               // Otherwise, decoding heatmap, it is guessed from the heatMapChannel.
+                               // Otherwise, it is guessed from the heatMapChannel.
                                int landmarkChannel = 0;
-                               if (landmarkInfo.GetDecodingType() == 0) {
+                               if (landmarkInfo.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
                                        landmarkChannel = mOutputLayerProperty.layers[landmarkInfo.GetName()].shape[channelIndexes[0]]
                                                                                / landmarkInfo.GetOffset();
                                } else {
@@ -1783,8 +1785,9 @@ namespace inference
 
                        float inputW = 1.f;
                        float inputH = 1.f;
-                       float thresRadius = landmarkInfo.GetType() == 0 ? 0.0 : outputMeta.GetLandmark().GetDecodingInfo().heatMap.nmsRadius;
-                       if (landmarkInfo.GetCoordinate() == 1) {
+                       float thresRadius = landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ? 0.0 :
+                                                                                                               outputMeta.GetLandmark().GetDecodingInfo().heatMap.nmsRadius;
+                       if (landmarkInfo.GetCoordinate() == INFERENCE_LANDMARK_COORDINATE_TYPE_PIXEL) {
                                inputW = static_cast<float>(mMetadata.GetInputMeta().layer.begin()->second.GetWidth());
                                inputH = static_cast<float>(mMetadata.GetInputMeta().layer.begin()->second.GetHeight());
                        }
index 84c6c6d..968bea3 100755 (executable)
@@ -706,6 +706,28 @@ namespace inference
                return parsed;
        }
 
+       Landmark::Landmark() :
+                       name(),
+                       dimInfo(),
+                       type(INFERENCE_LANDMARK_TYPE_2D_SINGLE),
+                       offset(),
+                       coordinate(INFERENCE_LANDMARK_COORDINATE_TYPE_RATIO),
+                       decodingType(INFERENCE_LANDMARK_DECODING_TYPE_BYPASS),
+                       decodingInfo()
+
+       {
+               supportedLandmarkTypes.insert({"2D_SINGLE", INFERENCE_LANDMARK_TYPE_2D_SINGLE});
+               supportedLandmarkTypes.insert({"2D_MULTI",  INFERENCE_LANDMARK_TYPE_2D_MULTI});
+               supportedLandmarkTypes.insert({"3D_SINGLE", INFERENCE_LANDMARK_TYPE_3D_SINGLE});
+
+               supportedLandmarkCoordinateTypes.insert({"RATIO", INFERENCE_LANDMARK_COORDINATE_TYPE_RATIO});
+               supportedLandmarkCoordinateTypes.insert({"PIXEL", INFERENCE_LANDMARK_COORDINATE_TYPE_PIXEL});
+
+               supportedLandmarkDecodingTypes.insert({"BYPASS", INFERENCE_LANDMARK_DECODING_TYPE_BYPASS});
+               supportedLandmarkDecodingTypes.insert({"HEATMAP", INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP});
+               supportedLandmarkDecodingTypes.insert({"HEATMAP_REFINE", INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE});
+       }
+
        int Landmark::ParseLandmark(JsonObject *root)
        {
                // box
@@ -730,24 +752,24 @@ namespace inference
                                        dimInfo.SetValidIndex(elem2);
                        }
 
-                       type = static_cast<int>(json_object_get_int_member(pObject, "landmark_type"));
-                       LOGI("landmark type: %d", type);
+                       try {
+                               type = OutputMetadata::GetSupportedType(pObject, "landmark_type", supportedLandmarkTypes);
+                               coordinate = OutputMetadata::GetSupportedType(pObject, "landmark_coordinate", supportedLandmarkCoordinateTypes);
+                               decodingType = OutputMetadata::GetSupportedType(pObject, "decoding_type", supportedLandmarkDecodingTypes);
+                       } catch (const std::exception& e) {
+                               LOGE("Invalid %s", e.what());
+                               return MEDIA_VISION_ERROR_INVALID_OPERATION;
+                       }
 
                        offset = static_cast<int>(json_object_get_int_member(pObject, "landmark_offset"));
                        LOGI("landmark offset: %d", offset);
-
-                       coordinate = static_cast<int>(json_object_get_int_member(pObject, "landmark_coordinate"));
-                       LOGI("landmark coordinate: %d", coordinate);
-
-                       decodingType = static_cast<int>(json_object_get_int_member(pObject, "decoding_type"));
-                       LOGI("landmark decodeing type: %d", decodingType);
                }
 
                LOGI("LEAVE");
                return MEDIA_VISION_ERROR_NONE;
        }
 
-       int Landmark::GetType()
+       inference_landmark_type_e Landmark::GetType()
        {
                return type;
        }
@@ -757,12 +779,12 @@ namespace inference
                return offset;
        }
 
-       int Landmark::GetCoordinate()
+       inference_landmark_coorindate_type_e Landmark::GetCoordinate()
        {
                return coordinate;
        }
 
-       int Landmark::GetDecodingType()
+       inference_landmark_decoding_type_e Landmark::GetDecodingType()
        {
                return decodingType;
        }
@@ -1046,8 +1068,7 @@ namespace inference
                }
 
                if (!landmark.GetName().empty()) {
-                       if (landmark.GetDecodingType() == 1 ||
-                               landmark.GetDecodingType() == 2) {
+                       if (landmark.GetDecodingType() != INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
                                ret = ParseLandmarkDecodeInfo(root);
                                if (ret != MEDIA_VISION_ERROR_NONE) {
                                        LOGE("Fail to GetLandmarkDecodeInfo[%d]", ret);
@@ -1055,7 +1076,7 @@ namespace inference
                                }
                        }
 
-                       if (landmark.GetDecodingType() == 2) {// landmark.decodingType == 2
+                       if (landmark.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) {
                                ret = ParseOffset(root);
                                if (ret != MEDIA_VISION_ERROR_NONE) {
                                        LOGE("Fail to GetOffsetVector[%d]", ret);
index 7711673..9798dfc 100644 (file)
@@ -52,12 +52,13 @@ namespace inference
 
                Landmark& landmarkInfo = mMeta.GetLandmark();
 
-               if (landmarkInfo.GetType() < 0 || landmarkInfo.GetType() >= 3) {
+               if (landmarkInfo.GetType() < INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
+                       landmarkInfo.GetType() > INFERENCE_LANDMARK_TYPE_3D_SINGLE) {
                        LOGE("Not supported landmark type");
                        return MEDIA_VISION_ERROR_INVALID_OPERATION;
                }
 
-               if (landmarkInfo.GetDecodingType() == 0) {
+               if (landmarkInfo.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
                        LOGI("Skip init");
                        return MEDIA_VISION_ERROR_NONE;
                }
@@ -71,8 +72,8 @@ namespace inference
 
                mCandidates.clear();
 
-               if (landmarkInfo.GetType() == 0 ||
-                       landmarkInfo.GetType() == 2) {
+               if (landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
+                       landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_3D_SINGLE) {
                        mCandidates.resize(mHeatMapChannel);
                }
 
@@ -90,8 +91,8 @@ namespace inference
                                        if (score < scoreInfo.GetThresHold())
                                                continue;
 
-                                       if (landmarkInfo.GetType() == 0 ||
-                                               landmarkInfo.GetType() == 2) {
+                                       if (landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
+                                               landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_3D_SINGLE) {
                                                if (score <= candidate->score)
                                                        continue;
 
@@ -245,25 +246,25 @@ namespace inference
                Landmark& landmarkInfo = mMeta.GetLandmark();
                ScoreInfo& scoreInfo = mMeta.GetScore();
 
-               if (landmarkInfo.GetType() == 0 ||
-                       landmarkInfo.GetType() == 2) { // single pose
+               if (landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
+                       landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_3D_SINGLE) {
                        mPoseLandmarks.resize(1);
 
-                       if (landmarkInfo.GetDecodingType() == 0) { // direct decoding
+                       if (landmarkInfo.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
                                mPoseLandmarks[0].landmarks.resize(mNumberOfLandmarks);
-                       } else { // heatmap decoding
+                       } else {
                                mPoseLandmarks[0].landmarks.resize(mHeatMapChannel);
                        }
                }
 
-               if (landmarkInfo.GetDecodingType() != 0) { // heatmap decoding
+               if (landmarkInfo.GetDecodingType() != INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
                        while (!mCandidates.empty()) {
 
                                LandmarkPoint &root = mCandidates.front();
 
                                getIndexToPos(root, scaleWidth, scaleHeight);
 
-                               if (landmarkInfo.GetType() == 0) {
+                               if (landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE) {
                                        root.valid = true;
                                        mPoseLandmarks[0].landmarks[root.id] = root;
                                        mPoseLandmarks[0].score += root.score;
@@ -330,8 +331,9 @@ namespace inference
                                }
                        }
 
-                       int landmarkOffset = (landmarkInfo.GetType() == 0 || landmarkInfo.GetType() == 1) ? 2 : 3;
-                       if (landmarkInfo.GetDecodingType() == 0) {
+                       int landmarkOffset = (landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ||
+                                                                 landmarkInfo.GetType() == INFERENCE_LANDMARK_TYPE_2D_MULTI) ? 2 : 3;
+                       if (landmarkInfo.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) {
                                landmarkOffset = landmarkInfo.GetOffset();
                        }
                        for (int idx = 0; idx < mNumberOfLandmarks; ++idx) {
index 7aeb8a1..a1ca707 100644 (file)
@@ -1,7 +1,7 @@
 Name:        capi-media-vision
 Summary:     Media Vision library for Tizen Native API
-Version:     0.8.12
-Release:     2
+Version:     0.8.13
+Release:     0
 Group:       Multimedia/Framework
 License:     Apache-2.0 and BSD-3-Clause
 Source0:     %{name}-%{version}.tar.gz