Add BoxInfo, Label, Number classes to OutputMetadata for object detection
authorTae-Young Chung <ty83.chung@samsung.com>
Tue, 27 Apr 2021 01:57:38 +0000 (10:57 +0900)
committerInki Dae <inki.dae@samsung.com>
Fri, 4 Jun 2021 03:00:10 +0000 (12:00 +0900)
BoxInfo, Label, Number classes are required for object detection.
As an example, meta files of Google-hosted models for object detection
are also added.

Change-Id: I968267c1108a5e79ce9fbadbb5ec0d258fc38f8b
Signed-off-by: Tae-Young Chung <ty83.chung@samsung.com>
meta-template/od_mobilenet_v1_ssd_postop_300x300.json [new file with mode: 0644]
meta-template/od_mobilenet_v2_ssd_320x320.json [new file with mode: 0644]
mv_inference/inference/include/Metadata.h
mv_inference/inference/include/OutputMetadata.h
mv_inference/inference/src/Inference.cpp
mv_inference/inference/src/Metadata.cpp
mv_inference/inference/src/OutputMetadata.cpp

diff --git a/meta-template/od_mobilenet_v1_ssd_postop_300x300.json b/meta-template/od_mobilenet_v1_ssd_postop_300x300.json
new file mode 100644 (file)
index 0000000..e9aa00b
--- /dev/null
@@ -0,0 +1,58 @@
+{
+    "inputmetadata" :
+    {
+        "tensor_info" : [
+            {
+                "name" : "normalized_input_image_tensor",
+                "shape_type" : 1,
+                "shape_dims" : [ 1, 300, 300, 3],
+                "data_type" : 0,
+                "color_space" : "RGB888"
+            }
+        ],
+        "preprocess" : [
+            {
+                "normalization" : [
+                    {
+                        "mean" : [127.5, 127.5, 127.5],
+                        "std" : [127.5, 127.5, 127.5]
+                    }
+                ]
+            }
+        ]
+    },
+    "outputmetadata" :
+    {
+        "score" : [
+            {
+                "name" : "TFLite_Detection_PostProcess:2",
+                "index" : [-1, 1],
+                "top_number" : 5,
+                "threshold" : 0.3,
+                "score_type" : 0
+            }
+        ],
+        "box" : [
+            {
+               "name" : "TFLite_Detection_PostProcess",
+               "index" : [-1, -1, 1],
+               "box_type" : 0,
+               "box_order" : [1, 0, 3, 2],
+               "box_coordinate" : 0,
+               "decoding_type": 0
+            }
+        ],
+        "label" : [
+            {
+                "name" : "TFLite_Detection_PostProcess:1",
+                "index" : [-1, 1]
+            }
+        ],
+        "number" : [
+            {
+                "name" : "TFLite_Detection_PostProcess:3",
+                "index" : [1]
+            }
+        ]
+    }
+}
diff --git a/meta-template/od_mobilenet_v2_ssd_320x320.json b/meta-template/od_mobilenet_v2_ssd_320x320.json
new file mode 100644 (file)
index 0000000..6574f70
--- /dev/null
@@ -0,0 +1,75 @@
+{
+    "inputmetadata" :
+    {
+        "tensor_info" : [
+            {
+                "name" : "normalized_input_image_tensor",
+                "shape_type" : 1,
+                "shape_dims" : [ 1, 320, 320, 3],
+                "data_type" : 0,
+                "color_space" : "RGB888"
+            }
+        ],
+        "preprocess" : [
+            {
+                "normalization" : [
+                    {
+                        "mean" : [127.5, 127.5, 127.5],
+                        "std" : [127.5, 127.5, 127.5]
+                    }
+                ]
+            }
+        ]
+    },
+    "outputmetadata" :
+    {
+        "score" : [
+            {
+                "name" : "raw_outputs/class_predictions",
+                "index" : [-1, -1, 1],
+                "top_number" : 5,
+                "threshold" : 0.6,
+                "score_type" : 1
+            }
+        ],
+        "box" : [
+            {
+                "name" : "raw_outputs/box_encodings",
+                "index" : [-1, -1, 1],
+                "box_type" : 1,
+                "box_order" : [1, 0, 3, 2],
+                "box_coordinate" : 0,
+                "decoding_type" : 1,
+                "decoding_info" :
+                {
+                    "anchor" :
+                    {
+                        "mode" : 0,
+                        "num_layers" : 6,
+                        "min_scale" : 0.2,
+                        "max_scale"  : 0.95,
+                        "input_size_height" : 320,
+                        "input_size_width"  : 320,
+                        "anchor_offset_x" : 0.5,
+                        "anchor_offset_y" : 0.5,
+                        "strides": [16, 32, 64, 128, 256, 512],
+                        "aspect_ratios": [1.0, 2.0, 0.5, 3.0, 0.333],
+                        "reduce_boxed_in_lowest_layer": true,
+                        "interpolated_scale_aspect_ratio": 1.0,
+                        "fixed_anchor_size": false,
+                        "exponential_box_scale": true,
+                        "x_scale" : 10.0,
+                        "y_scale" : 10.0,
+                        "w_scale" : 5.0,
+                        "h_scale" : 5.0
+                    },
+                    "nms" :
+                    {
+                        "mode": 0,
+                        "threshold": 0.4
+                    }
+                }
+            }
+        ]
+    }
+}
index 322fd3e..ecf9ef6 100644 (file)
@@ -70,8 +70,8 @@ namespace inference
                 */
                int Parse();
 
-               const InputMetadata& GetInputMeta();
-               const OutputMetadata& GetOutputMeta();
+               InputMetadata& GetInputMeta();
+               OutputMetadata& GetOutputMeta();
 
        private:
                int ParseInputMeta(JsonObject *object);
index 107f7c2..22a0cb2 100644 (file)
@@ -24,6 +24,7 @@
 
 #include <mv_inference_type.h>
 #include <json-glib/json-glib.h>
+#include <opencv2/core.hpp>
 
 /**
  * @file OutputMetadata.h
@@ -37,42 +38,170 @@ namespace inference
 {
        class DimInfo
        {
+       private:
+               std::vector<int> dims;
        public:
-               std::vector<int> index;
+               std::vector<int> GetValidIndexAll() const;
+               void SetValidIndex(int index);
        };
 
        class DeQuantization
        {
-       public:
+       private:
                double scale;
                double zeropoint;
-
+       public:
                DeQuantization(double s, double z) : scale(s), zeropoint(z) {};
                ~DeQuantization() = default;
+
+               double GetScale() { return scale; }
+               double GetZeroPoint() { return zeropoint; }
        };
 
        class ScoreInfo
        {
-       public:
+       private:
                std::string name;
                DimInfo dimInfo;
                double threshold;
                int type;
                int topNumber;
-               std::unique_ptr<DeQuantization> deQuantization;
+               std::shared_ptr<DeQuantization> deQuantization;
 
        public:
                ScoreInfo() = default;
                ~ScoreInfo() = default;
-               int GetIndex() const;
+
+               std::string GetName() { return name; }
+               DimInfo GetDimInfo() { return dimInfo; }
+               double GetThresHold() { return threshold; }
+               int GetType() { return type; }
+               int GetTopNumber() { return topNumber; }
+               std::shared_ptr<DeQuantization> GetDeQuant() { return deQuantization; }
+
+               int ParseScore(JsonObject *root);
        };
 
-       class OutputMetadata
+       class BoxInfo
        {
        public:
+               class DecodeInfo {
+               public:
+                       class AnchorParam {
+                       public:
+                               int mode; /**< 0: generate anchor, 1:load pre-anchor*/
+                               int numLayers;
+                               float minScale;
+                               float maxScale;
+                               int inputSizeHeight;
+                               int inputSizeWidth;
+                               float anchorOffsetX;
+                               float anchorOffsetY;
+                               std::vector<int> strides;
+                               std::vector<float> aspectRatios;
+                               bool isReduceBoxedInLowestLayer;
+                               float interpolatedScaleAspectRatio;
+                               bool isFixedAnchorSize;
+                               bool isExponentialBoxScale;
+                               float xScale;
+                               float yScale;
+                               float wScale;
+                               float hScale;
+
+                               AnchorParam() = default;
+                               ~AnchorParam() =  default;
+                       };
+
+                       class NMSParam {
+                       public:
+                               int mode; /**< 0: IOU */
+                               float threshold;
+
+                               NMSParam() : mode(-1), threshold(0.2f) {};
+                               ~NMSParam() = default;
+                       };
+
+                       AnchorParam anchorParam;
+                       std::vector<cv::Rect2f> anchorBoxes;
+
+                       NMSParam nmsParam;
+
+                       DecodeInfo() = default;
+                       ~DecodeInfo() = default;
+               };
+
+       private:
+               std::string name;
+               DimInfo dimInfo;
+               int type; // 0:LTRB, 1: CxCyWH
+               std::vector<int> order; // Order based on box type
+               int coordinate; // 0: ratio, 1: pixel
+               int decodingType; // 0: post-op, 1: achorbox(ssd), 2:yolo(?)
+               DecodeInfo decodingInfo;
+
+       public:
+               BoxInfo() = default;
+               ~BoxInfo() = default;
+
+               std::string GetName() { return name; }
+               DimInfo GetDimInfo() { return dimInfo; }
+               int GetType() { return type; }
+               std::vector<int> GetOrder() { return order; }
+               int GetCoordinate() { return coordinate; }
+               int GetDecoddingType() { return decodingType; }
+               DecodeInfo& GetDecodeInfo() {return decodingInfo; }
+
+               int ParseBox(JsonObject *root);
+       };
+
+       class Label
+       {
+       private:
+               std::string name;
+               DimInfo dimInfo;
+
+       public:
+               Label() = default;
+               ~Label() = default;
+               std::string GetName() { return name; }
+               DimInfo GetDimInfo() { return dimInfo; }
+
+               int ParseLabel(JsonObject *root);
+       };
+
+       class Number
+       {
+       private:
+               std::string name;
+               DimInfo dimInfo;
+
+       public:
+               Number() = default;
+               ~Number() = default;
+               std::string GetName() { return name; }
+               DimInfo GetDimInfo() { return dimInfo; }
+
+               int ParseNumber(JsonObject *root);
+       };
+
+       class OutputMetadata
+       {
+       private:
                bool parsed;
                ScoreInfo score;
+               BoxInfo box;
+               Label label;
+               Number number;
+
+               int ParseScore(JsonObject *root);
+               int ParseBox(JsonObject *root);
+               int ParseLabel(JsonObject *root);
+               int ParseNumber(JsonObject *root);
+               int ParseBoxDecodeInfo(JsonObject *root);
+               int GenerateAnchor();
+               float CalculateScale(float min, float max, int index, int maxStride);
 
+       public:
                /**
                 * @brief   Creates an OutputMetadata class instance.
                 *
@@ -94,9 +223,11 @@ namespace inference
                 */
                int Parse(JsonObject *root);
 
-       private:
-               int GetScore(JsonObject *root);
-
+               bool IsParsed();
+               ScoreInfo& GetScore();
+               BoxInfo& GetBox();
+               Label& GetLabel();
+               Number& GetNumber();
        };
 
 } /* Inference */
index 061c00f..d398a82 100644 (file)
@@ -457,10 +457,10 @@ namespace inference
 
                mConfig.mOutputLayerNames = names;
 
-               const OutputMetadata& outputMeta = mMetadata.GetOutputMeta();
-               if (outputMeta.parsed) {
+               OutputMetadata& outputMeta = mMetadata.GetOutputMeta();
+               if (outputMeta.IsParsed()) {
                        mConfig.mOutputLayerNames.clear();
-                       mConfig.mOutputLayerNames.push_back(outputMeta.score.name);
+                       mConfig.mOutputLayerNames.push_back(outputMeta.GetScore().GetName());
                }
 
                inference_engine_layer_property property;
@@ -1137,39 +1137,43 @@ namespace inference
        int Inference::GetClassficationResults(
                        ImageClassificationResults *classificationResults)
        {
-               const OutputMetadata& outputMeta = mMetadata.GetOutputMeta();
-               if (outputMeta.parsed) {
+               OutputMetadata& outputMeta = mMetadata.GetOutputMeta();
+               if (outputMeta.IsParsed()) {
                        std::vector<std::pair<float, int>> topScore;
                        float value = 0.0f;
-                       auto& info = outputMeta.score;
+                       auto& info = outputMeta.GetScore();
 
-                       int index = info.GetIndex();
-                       int classes = mOutputLayerProperty.layers[info.name].shape[index];
+                       std::vector<int> indexes = info.GetDimInfo().GetValidIndexAll();
+                       if (indexes.size() != 1) {
+                               LOGE("Invalid dim size. It should be 1");
+                               return MEDIA_VISION_ERROR_INVALID_OPERATION;
+                       }
+                       int classes = mOutputLayerProperty.layers[info.GetName()].shape[indexes[0]];
 
-                       if (!mOutputTensorBuffers.exist(info.name)) {
+                       if (!mOutputTensorBuffers.exist(info.GetName())) {
                                LOGE("output buffe is NULL");
                                return MEDIA_VISION_ERROR_INVALID_OPERATION;
                        }
 
-                       mPostProc.ScoreClear(info.topNumber);
+                       mPostProc.ScoreClear(info.GetTopNumber());
                        for (int cId = 0; cId < classes; ++cId) {
                                try {
-                                       value = mOutputTensorBuffers.getValue<float>(info.name, cId);
+                                       value = mOutputTensorBuffers.getValue<float>(info.GetName(), cId);
                                } catch (const std::exception& e) {
                                        LOGE(" Fail to get getValue with %s", e.what());
                                        return MEDIA_VISION_ERROR_INVALID_OPERATION;
                                }
 
-                               if (info.deQuantization) {
+                               if (info.GetDeQuant()) {
                                        value = PostProcess::dequant(value,
-                                                                                       info.deQuantization->scale,
-                                                                                       info.deQuantization->zeropoint);
+                                                                                       info.GetDeQuant()->GetScale(),
+                                                                                       info.GetDeQuant()->GetZeroPoint());
                                }
-                               if (info.type == 1) {
+                               if (info.GetType() == 1) {
                                        value = PostProcess::sigmoid(value);
                                }
 
-                               if (value < info.threshold)
+                               if (value < info.GetThresHold())
                                        continue;
 
                                LOGI("id[%d]: %.3f", cId, value);
@@ -1180,7 +1184,7 @@ namespace inference
                        ImageClassificationResults results;
                        results.number_of_classes = 0;
                        for (auto& value : topScore) {
-                               LOGI("score: %.3f, threshold: %.3f", value.first, info.threshold);
+                               LOGI("score: %.3f, threshold: %.3f", value.first, info.GetThresHold());
                                LOGI("idx:%d", value.second);
                                LOGI("classProb: %.3f", value.first);
 
index bb42557..b2ae9ff 100644 (file)
@@ -108,12 +108,12 @@ namespace inference
                return ret;
        }
 
-       const InputMetadata& Metadata::GetInputMeta()
+       InputMetadata& Metadata::GetInputMeta()
        {
                return mInputMeta;
        }
 
-       const OutputMetadata& Metadata::GetOutputMeta()
+       OutputMetadata& Metadata::GetOutputMeta()
        {
                return mOutputMeta;
        }
index 621b67b..7865787 100644 (file)
@@ -27,7 +27,53 @@ namespace mediavision
 {
 namespace inference
 {
-       int OutputMetadata::GetScore(JsonObject *root)
+       int ScoreInfo::ParseScore(JsonObject *root)
+       {
+               LOGI("ENTER");
+
+               JsonArray * rootArray = json_object_get_array_member(root, "score");
+               unsigned int elements = json_array_get_length(rootArray);
+
+               for (unsigned int elem = 0; elem < elements; ++elem) {
+                       JsonNode *pNode = json_array_get_element(rootArray, elem);
+                       JsonObject *pObject = json_node_get_object(pNode);
+
+                       name = json_object_get_string_member(pObject,"name");
+                       LOGI("layer: %s", name.c_str());
+
+                       JsonArray * array = json_object_get_array_member(pObject, "index");
+                       unsigned int elements2 = json_array_get_length(array);
+                       LOGI("range dim: size[%u]", elements2);
+                       for (unsigned int elem2 = 0; elem2 < elements2; ++elem2) {
+                               if (static_cast<int>(json_array_get_int_element(array, elem2)) == 1)
+                                       dimInfo.SetValidIndex(elem2);
+                       }
+
+                       topNumber = static_cast<int>(json_object_get_int_member(pObject, "top_number"));
+                       LOGI("top number: %d", topNumber);
+
+                       threshold = static_cast<double>(json_object_get_double_member(pObject, "threshold"));
+                       LOGI("threshold: %1.3f", threshold);
+
+                       type = static_cast<int>(json_object_get_int_member(pObject, "score_type"));
+                       LOGI("score type: %d", type);
+
+                       if (json_object_has_member(pObject, "dequantization")) {
+                               array = json_object_get_array_member(pObject, "dequantization");
+                               JsonNode *node = json_array_get_element(array, 0);
+                               JsonObject *object = json_node_get_object(node);
+
+                               deQuantization = std::make_shared<DeQuantization>(
+                                       json_object_get_double_member(object, "scale"),
+                                       json_object_get_double_member(object, "zeropoint"));
+                       }
+               }
+
+               LOGI("LEAVE");
+               return MEDIA_VISION_ERROR_NONE;
+       }
+
+       int OutputMetadata::ParseScore(JsonObject *root)
        {
                LOGI("ENTER");
 
@@ -37,64 +83,423 @@ namespace inference
                        return MEDIA_VISION_ERROR_NONE;
                }
 
-               // score
-               JsonArray * rootArray = json_object_get_array_member(root, "score");
+               score.ParseScore(root);
+
+               LOGI("LEAVE");
+               return MEDIA_VISION_ERROR_NONE;
+       }
+
+       int BoxInfo::ParseBox(JsonObject *root)
+       {
+               LOGI("ENTER");
+
+               JsonArray * rootArray = json_object_get_array_member(root, "box");
+               unsigned int elements = json_array_get_length(rootArray);
+
+               for (unsigned int elem = 0; elem < elements; ++elem) {
+                       JsonNode *pNode = json_array_get_element(rootArray, elem);
+                       JsonObject *pObject = json_node_get_object(pNode);
+
+                       name = json_object_get_string_member(pObject,"name");
+                       LOGI("layer: %s", name.c_str());
+
+                       JsonArray * array = json_object_get_array_member(pObject, "index");
+                       unsigned int elements2 = json_array_get_length(array);
+                       LOGI("range dim: size[%u]", elements2);
+                       for (unsigned int elem2 = 0; elem2 < elements2; ++elem2) {
+                               if (static_cast<int>(json_array_get_int_element(array, elem2)) == 1)
+                                       dimInfo.SetValidIndex(elem2);
+                       }
+
+                       type = static_cast<int>(json_object_get_int_member(pObject, "box_type"));
+                       LOGI("box type: %d", type);
+
+                       array = json_object_get_array_member(pObject, "box_order");
+                       elements2 = json_array_get_length(array);
+                       LOGI("box order should have 4 elements and it has [%u]", elements2);
+                       for (unsigned int elem2 = 0; elem2 < elements2; ++elem2) {
+                               auto val = static_cast<int>(json_array_get_int_element(array, elem2));
+                               order.push_back(val);
+                               LOGI("%d", val);
+                       }
+
+                       coordinate = static_cast<int>(json_object_get_int_member(pObject, "box_coordinate"));
+                       LOGI("box coordinate: %d", coordinate);
+
+                       decodingType = static_cast<int>(json_object_get_int_member(pObject, "decoding_type"));
+                       LOGI("box decodeing type: %d", decodingType);
+               }
+
+               LOGI("LEAVE");
+               return MEDIA_VISION_ERROR_NONE;
+       }
+
+       int OutputMetadata::ParseBox(JsonObject *root)
+       {
+               LOGI("ENTER");
+
+               if (json_object_has_member(root, "box") == false) {
+                       LOGE("No box outputmetadata");
+                       LOGI("LEAVE");
+                       return MEDIA_VISION_ERROR_NONE;
+               }
+
+               box.ParseBox(root);
+
+               LOGI("LEAVE");
+               return MEDIA_VISION_ERROR_NONE;
+       }
+
+       int Label::ParseLabel(JsonObject *root)
+       {
+               LOGI("ENTER");
+
+               JsonArray * rootArray = json_object_get_array_member(root, "label");
                unsigned int elements = json_array_get_length(rootArray);
 
                // TODO: handling error
-               // FIXEME: ScoreInfo.set()??
                for (unsigned int elem = 0; elem < elements; ++elem) {
+                       JsonNode *pNode = json_array_get_element(rootArray, elem);
+                       JsonObject *pObject = json_node_get_object(pNode);
+
+                       name = json_object_get_string_member(pObject,"name");
+                       LOGI("layer: %s", name.c_str());
+
+                       JsonArray * array = json_object_get_array_member(pObject, "index");
+                       unsigned int elements2 = json_array_get_length(array);
+                       LOGI("range dim: size[%u]", elements2);
+                       for (unsigned int elem2 = 0; elem2 < elements2; ++elem2) {
+                               if (static_cast<int>(json_array_get_int_element(array, elem2)) == 1)
+                                       dimInfo.SetValidIndex(elem2);
+                       }
+               }
+
+               LOGI("LEAVEL");
+               return MEDIA_VISION_ERROR_NONE;
+       }
+
+       int OutputMetadata::ParseLabel(JsonObject *root)
+       {
+               LOGI("ENTER");
+
+               if (json_object_has_member(root, "label") == false) {
+                       LOGE("No box outputmetadata");
+                       LOGI("LEAVE");
+                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
+               }
+
+               label.ParseLabel(root);
+
+               LOGI("LEAVE");
+               return MEDIA_VISION_ERROR_NONE;
+       }
+
+       int Number::ParseNumber(JsonObject *root)
+       {
+               // number
+               JsonArray * rootArray = json_object_get_array_member(root, "number");
+               unsigned int elements = json_array_get_length(rootArray);
 
+               // TODO: handling error
+               for (unsigned int elem = 0; elem < elements; ++elem) {
                        JsonNode *pNode = json_array_get_element(rootArray, elem);
                        JsonObject *pObject = json_node_get_object(pNode);
 
-                       score.name =
-                                               static_cast<const char*>(json_object_get_string_member(pObject,"name"));
-                       LOGI("layer: %s", score.name.c_str());
+                       name = json_object_get_string_member(pObject,"name");
+                       LOGI("layer: %s", name.c_str());
 
                        JsonArray * array = json_object_get_array_member(pObject, "index");
                        unsigned int elements2 = json_array_get_length(array);
                        LOGI("range dim: size[%u]", elements2);
                        for (unsigned int elem2 = 0; elem2 < elements2; ++elem2) {
-                               auto index = static_cast<int>(json_array_get_int_element(array, elem2));
-                               score.dimInfo.index.push_back(index);
-                               LOGI("%d", index);
+                               if (static_cast<int>(json_array_get_int_element(array, elem2)) == 1)
+                                       dimInfo.SetValidIndex(elem2);
                        }
+               }
 
-                       score.topNumber = static_cast<int>(json_object_get_int_member(pObject, "top_number"));
-                       LOGI("top number: %d", score.topNumber);
+               return MEDIA_VISION_ERROR_NONE;
+       }
 
-                       score.threshold = static_cast<double>(json_object_get_double_member(pObject, "threshold"));
-                       LOGI("threshold: %1.3f", score.threshold);
+       int OutputMetadata::ParseNumber(JsonObject *root)
+       {
+               LOGI("ENTER");
 
-                       score.type = static_cast<int>(json_object_get_int_member(pObject, "score_type"));
-                       LOGI("score type: %d", score.type);
+               if (json_object_has_member(root, "number") == false) {
+                       LOGE("No number outputmetadata");
+                       LOGI("LEAVE");
+                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
+               }
+               number.ParseNumber(root);
 
-                       if (json_object_has_member(pObject, "dequantization")) {
-                               array = json_object_get_array_member(pObject, "dequantization");
-                               JsonNode *node = json_array_get_element(array, 0);
-                               JsonObject *object = json_node_get_object(node);
+               LOGI("LEAVE");
+               return MEDIA_VISION_ERROR_NONE;
+       }
 
-                               score.deQuantization = std::make_unique<DeQuantization>(
-                                       json_object_get_double_member(object, "scale"),
-                                       json_object_get_double_member(object, "zeropoint"));
+       int OutputMetadata::ParseBoxDecodeInfo(JsonObject *root)
+       {
+               LOGI("ENTER");
+
+               if (json_object_has_member(root, "box") == false) {
+                       LOGE("No box outputmetadata");
+                       LOGI("LEAVE");
+                       return MEDIA_VISION_ERROR_NONE;
+               }
+
+               // box
+               JsonArray * rootArray = json_object_get_array_member(root, "box");
+               unsigned int elements = json_array_get_length(rootArray);
+
+               // TODO: handling error
+               for (unsigned int elem = 0; elem < elements; ++elem) {
+                       JsonNode *pNode = json_array_get_element(rootArray, elem);
+                       JsonObject *pObject = json_node_get_object(pNode);
+
+                       if (json_object_has_member(pObject, "decoding_info") == false) {
+                               LOGE("decoding_info is mandatory. Invalid metadata");
+                               LOGI("LEAVE");
+
+                               return MEDIA_VISION_ERROR_INVALID_OPERATION;
+                       }
+
+                       JsonObject *cObject = json_object_get_object_member(pObject, "decoding_info");
+                       if (json_object_has_member(cObject, "anchor") == false) {
+                               LOGE("anchor is mandatory. Invalid metadata");
+                               LOGI("LEAVE");
+
+                               return MEDIA_VISION_ERROR_INVALID_OPERATION;
                        }
+
+                       JsonObject *object = json_object_get_object_member(cObject, "anchor") ;
+                       BoxInfo::DecodeInfo& decodeInfo = box.GetDecodeInfo();
+                       decodeInfo.anchorParam.mode = static_cast<int>(json_object_get_int_member(object, "mode"));
+                       decodeInfo.anchorParam.numLayers = static_cast<int>(json_object_get_int_member(object, "num_layers"));
+                       decodeInfo.anchorParam.minScale = static_cast<float>(json_object_get_double_member(object, "min_scale"));
+                       decodeInfo.anchorParam.maxScale = static_cast<float>(json_object_get_double_member(object, "max_scale"));
+                       decodeInfo.anchorParam.inputSizeHeight = static_cast<int>(json_object_get_int_member(object, "input_size_height"));
+                       decodeInfo.anchorParam.inputSizeWidth = static_cast<int>(json_object_get_int_member(object, "input_size_width"));
+                       decodeInfo.anchorParam.anchorOffsetX = static_cast<float>(json_object_get_double_member(object, "anchor_offset_x"));
+                       decodeInfo.anchorParam.anchorOffsetY = static_cast<float>(json_object_get_double_member(object, "anchor_offset_y"));
+                       decodeInfo.anchorParam.isReduceBoxedInLowestLayer =
+                                                                                       static_cast<bool>(json_object_get_boolean_member(object, "reduce_boxed_in_lowest_layer"));
+                       decodeInfo.anchorParam.interpolatedScaleAspectRatio =
+                                                                                       static_cast<float>(json_object_get_double_member(object, "interpolated_scale_aspect_ratio"));
+                       decodeInfo.anchorParam.isFixedAnchorSize =
+                                                                                       static_cast<bool>(json_object_get_boolean_member(object, "fixed_anchor_size"));
+                       decodeInfo.anchorParam.isExponentialBoxScale =
+                                                                                       static_cast<bool>(json_object_get_boolean_member(object, "exponential_box_scale"));
+
+                       decodeInfo.anchorParam.xScale = static_cast<float>(json_object_get_double_member(object, "x_scale"));
+                       decodeInfo.anchorParam.yScale = static_cast<float>(json_object_get_double_member(object, "y_scale"));
+                       decodeInfo.anchorParam.wScale = static_cast<float>(json_object_get_double_member(object, "w_scale"));
+                       decodeInfo.anchorParam.hScale = static_cast<float>(json_object_get_double_member(object, "h_scale"));
+
+                       JsonArray * array = json_object_get_array_member(object, "strides");
+                       unsigned int elements2 = json_array_get_length(array);
+                       for (unsigned int elem2 = 0; elem2 < elements2; ++elem2) {
+                               auto stride = static_cast<int>(json_array_get_int_element(array, elem2));
+                               decodeInfo.anchorParam.strides.push_back(stride);
+                               LOGI("stride: %d", stride);
+                       }
+
+                       array = json_object_get_array_member(object, "aspect_ratios");
+                       elements2 = json_array_get_length(array);
+                       for (unsigned int elem2 = 0; elem2 < elements2; ++elem2) {
+                               auto aspectRatio = static_cast<float>(json_array_get_double_element(array, elem2));
+                               decodeInfo.anchorParam.aspectRatios.push_back(aspectRatio);
+                               LOGI("aspectRatio: %.4f", aspectRatio);
+                       }
+
+                       if (json_object_has_member(cObject, "nms") == false) {
+                               LOGI("nms is empty. skip it");
+                               continue;
+                       }
+
+                       object = json_object_get_object_member(cObject, "nms");
+                       decodeInfo.nmsParam.mode = static_cast<int>(json_object_get_int_member(object, "mode"));
+                       decodeInfo.nmsParam.threshold = static_cast<float>(json_object_get_double_member(object,"threshold"));
                }
 
                LOGI("LEAVE");
                return MEDIA_VISION_ERROR_NONE;
+
+       }
+
+       /**
+        * @brief Linearly interpolates an anchor scale between min and max.
+        *
+        * @param min       scale returned for index 0
+        * @param max       scale returned for index (maxStride - 1)
+        * @param index     position inside the range [0, maxStride - 1]
+        * @param maxStride number of steps the [min, max] range is spread over
+        * @return the interpolated scale; when maxStride is 1 the midpoint of
+        *         min and max is returned, because the interpolation formula
+        *         would otherwise divide by zero
+        */
+       float OutputMetadata::CalculateScale(float min, float max, int index, int maxStride)
+       {
+               // a single step leaves nothing to interpolate over
+               if (maxStride == 1)
+                       return (min + max) * 0.5f;
+
+               return min + (max - min) * 1.0 * index / (maxStride - 1.0f);
+       }
+
+       /**
+        * @brief Generates anchor boxes from the parsed anchor parameters and
+        *        appends them to the anchorBoxes of the box decode info.
+        *
+        * Consecutive layers that share the same stride are merged into one
+        * feature map. For every cell of that feature map one anchor is added
+        * per collected (aspect ratio, scale) pair.
+        *
+        * @return MEDIA_VISION_ERROR_NONE on success,
+        *         MEDIA_VISION_ERROR_INVALID_OPERATION when the anchor
+        *         parameters are unusable or no anchor box is available
+        */
+       int OutputMetadata::GenerateAnchor()
+       {
+               BoxInfo::DecodeInfo& decodeInfo = box.GetDecodeInfo();
+               const auto& anchorParam = decodeInfo.anchorParam;
+
+               if (anchorParam.strides.empty() ||
+                       anchorParam.aspectRatios.empty()) {
+                       LOGE("Invalid anchor parameters");
+                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
+               }
+
+               // every layer in [0, numLayers) reads strides[layer], so a layer
+               // count larger than the number of strides would read out of bounds
+               const int numStrides = static_cast<int>(anchorParam.strides.size());
+               if (anchorParam.numLayers > numStrides) {
+                       LOGE("Invalid number of layers[%d] for %d strides",
+                                       anchorParam.numLayers, numStrides);
+                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
+               }
+
+               // the stride is the divisor of the feature map size; reject
+               // non-positive values before any anchor is generated
+               for (int layer = 0; layer < anchorParam.numLayers; ++layer) {
+                       if (anchorParam.strides[layer] <= 0) {
+                               LOGE("Invalid stride[%d] at layer[%d]",
+                                               anchorParam.strides[layer], layer);
+                               return MEDIA_VISION_ERROR_INVALID_OPERATION;
+                       }
+               }
+
+               int layerId = 0;
+               while (layerId < anchorParam.numLayers) {
+                       std::vector<float> anchorHeight;
+                       std::vector<float> anchorWidth;
+                       std::vector<float> aspectRatios;
+                       std::vector<float> scales;
+
+                       // collect the (aspect ratio, scale) pairs of all consecutive
+                       // layers which use the same stride as layerId
+                       int lastSameStrideLayer = layerId;
+                       while ((lastSameStrideLayer < anchorParam.numLayers) &&
+                               (anchorParam.strides[lastSameStrideLayer] ==
+                                anchorParam.strides[layerId])) {
+                               const float scale = CalculateScale(anchorParam.minScale,
+                                                                               anchorParam.maxScale,
+                                                                               lastSameStrideLayer,
+                                                                               numStrides);
+
+                               if (lastSameStrideLayer == 0 &&
+                                       anchorParam.isReduceBoxedInLowestLayer) {
+                                       // the first layer uses a fixed set of three anchors
+                                       aspectRatios.push_back(1.0f);
+                                       aspectRatios.push_back(2.0f);
+                                       aspectRatios.push_back(0.5f);
+                                       scales.push_back(0.1f);
+                                       scales.push_back(scale);
+                                       scales.push_back(scale);
+                               } else {
+                                       for (const float ratio : anchorParam.aspectRatios) {
+                                               aspectRatios.push_back(ratio);
+                                               scales.push_back(scale);
+                                       }
+                                       if (anchorParam.interpolatedScaleAspectRatio > 0.0f) {
+                                               // extra anchor whose scale is the geometric mean of
+                                               // this layer's scale and the next one (1.0 after
+                                               // the last stride)
+                                               const float scaleNext =
+                                                       (lastSameStrideLayer == numStrides - 1)
+                                                               ? 1.0f
+                                                               : CalculateScale(anchorParam.minScale,
+                                                                               anchorParam.maxScale,
+                                                                               lastSameStrideLayer + 1,
+                                                                               numStrides);
+                                               scales.push_back(std::sqrt(scale * scaleNext));
+                                               aspectRatios.push_back(anchorParam.interpolatedScaleAspectRatio);
+                                       }
+                               }
+                               lastSameStrideLayer++;
+                       }
+
+                       // aspectRatios and scales are always filled pairwise
+                       for (size_t idx = 0; idx < aspectRatios.size(); ++idx) {
+                               const float ratioSqrts = std::sqrt(aspectRatios[idx]);
+                               anchorHeight.push_back(scales[idx] / ratioSqrts);
+                               anchorWidth.push_back(scales[idx] * ratioSqrts);
+                       }
+
+                       const int stride = anchorParam.strides[layerId];
+                       const int featureMapHeight =
+                               static_cast<int>(std::ceil(1.0f * anchorParam.inputSizeHeight / stride));
+                       const int featureMapWidth =
+                               static_cast<int>(std::ceil(1.0f * anchorParam.inputSizeWidth / stride));
+
+                       for (int y = 0; y < featureMapHeight; ++y) {
+                               for (int x = 0; x < featureMapWidth; ++x) {
+                                       for (int anchorId = 0; anchorId < static_cast<int>(anchorHeight.size()); ++anchorId) {
+                                               // x/y of the rect hold the cell position plus offset,
+                                               // normalized by the feature map size
+                                               cv::Rect2f anchor = {
+                                                       cv::Point2f {
+                                                               (x + anchorParam.anchorOffsetX) * 1.0f / featureMapWidth,
+                                                               (y + anchorParam.anchorOffsetY) * 1.0f / featureMapHeight
+                                                       },
+                                                       anchorParam.isFixedAnchorSize ?
+                                                               cv::Size2f {1.0f, 1.0f} :
+                                                               cv::Size2f {anchorWidth[anchorId], anchorHeight[anchorId]}
+                                               };
+                                               decodeInfo.anchorBoxes.push_back(anchor);
+                                       }
+                               }
+                       }
+                       // continue with the first layer that has a different stride
+                       layerId = lastSameStrideLayer;
+               }
+
+               if (decodeInfo.anchorBoxes.empty()) {
+                       LOGE("Anchor boxes are empty");
+                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
+               }
+
+               return MEDIA_VISION_ERROR_NONE;
+       }
+
+       /**
+        * @brief Gives access to the score information of this metadata.
+        * @return mutable reference to the ScoreInfo member
+        */
+       ScoreInfo& OutputMetadata::GetScore()
+       {
+               return this->score;
+       }
+
+       /**
+        * @brief Gives access to the box information of this metadata.
+        * @return mutable reference to the BoxInfo member
+        */
+       BoxInfo& OutputMetadata::GetBox()
+       {
+               return this->box;
+       }
+
+       /**
+        * @brief Gives access to the label information of this metadata.
+        * @return mutable reference to the Label member
+        */
+       Label& OutputMetadata::GetLabel()
+       {
+               return this->label;
+       }
+
+       /**
+        * @brief Gives access to the number information of this metadata.
+        * @return mutable reference to the Number member
+        */
+       Number& OutputMetadata::GetNumber()
+       {
+               return this->number;
+       }
+
+       /**
+        * @brief Reports the parsed flag, which Parse() sets to true once it
+        *        has run through all of its parsing steps.
+        * @return current value of the parsed flag
+        */
+       bool OutputMetadata::IsParsed()
+       {
+               return this->parsed;
        }
 
        int OutputMetadata::Parse(JsonObject *root)
        {
                LOGI("ENTER");
 
-               int ret = GetScore(root);
+               int ret = ParseScore(root);
                if (ret != MEDIA_VISION_ERROR_NONE) {
                        LOGE("Fail to GetScore[%d]", ret);
                        return ret;
                }
 
+               ret = ParseBox(root);
+               if (ret != MEDIA_VISION_ERROR_NONE) {
+                       LOGE("Fail to GetBox[%d]", ret);
+                       return ret;
+               }
+
+               if (!box.GetName().empty()) {
+                       // additional parsing is required according to decoding type
+                       if (box.GetDecoddingType() == 0) {
+
+                               ret = ParseLabel(root);
+                               if (ret != MEDIA_VISION_ERROR_NONE) {
+                                       LOGE("Fail to GetLabel[%d]", ret);
+                                       return ret;
+                               }
+
+                               ret = ParseNumber(root);
+                               if (ret != MEDIA_VISION_ERROR_NONE) {
+                                       LOGE("Fail to GetNumber[%d]", ret);
+                                       return ret;
+                               }
+
+                       } else if (box.GetDecoddingType() == 1) {
+                               ret = ParseBoxDecodeInfo(root);
+                               if (ret != MEDIA_VISION_ERROR_NONE) {
+                                       LOGE("Fail to GetBoxDecodeInfo[%d]", ret);
+                                       return ret;
+                               }
+
+                               ret = GenerateAnchor();
+                               if (ret != MEDIA_VISION_ERROR_NONE) {
+                                       LOGE("Fail to GenerateAnchor[%d]", ret);
+                                       return ret;
+                               }
+
+                       } else {
+                               LOGW("Unknow box decoding type. Ignore");
+                       }
+               }
+
                parsed = true;
 
                LOGI("LEAVE");
@@ -102,21 +507,21 @@ namespace inference
                return MEDIA_VISION_ERROR_NONE;
        }
 
-       int ScoreInfo::GetIndex() const
+       void DimInfo::SetValidIndex(int index)
        {
                LOGI("ENTER");
 
-               int ret = 0;
-               for (auto& index : dimInfo.index) {
-                       if (index > 0) {
-                               break;
-                       }
-                       ret++;
-               }
+               dims.push_back(index);
 
                LOGI("LEAVE");
+       }
+
+       std::vector<int> DimInfo::GetValidIndexAll() const
+       {
+               LOGI("ENTER");
 
-               return ret;
+               LOGI("LEAVE");
+               return dims;
        }
 } /* Inference */
 } /* MediaVision */