mv_machine_learning: support YOLO-V5 inference [82/270082/1]
author Tae-Young Chung <ty83.chung@samsung.com>
Tue, 25 Jan 2022 07:18:19 +0000 (16:18 +0900)
committer Tae-Young Chung <ty83.chung@samsung.com>
Tue, 25 Jan 2022 07:18:48 +0000 (16:18 +0900)
[Version] 0.13.0-0
[Issue type] new feature

Change-Id: I6dd5d13904e7c201693441494f87deba42f35924
Signed-off-by: Tae-Young Chung <ty83.chung@samsung.com>
meta-template/od_yolo_v5_320x320.json [new file with mode: 0644]
mv_machine_learning/mv_inference/inference/include/BoxInfo.h
mv_machine_learning/mv_inference/inference/include/DecodeInfo.h
mv_machine_learning/mv_inference/inference/include/OutputMetadataTypes.h
mv_machine_learning/mv_inference/inference/src/Inference.cpp
mv_machine_learning/mv_inference/inference/src/ObjectDecoder.cpp
mv_machine_learning/mv_inference/inference/src/OutputMetadata.cpp
packaging/capi-media-vision.spec
test/testsuites/machine_learning/inference/inference_test_suite.c

diff --git a/meta-template/od_yolo_v5_320x320.json b/meta-template/od_yolo_v5_320x320.json
new file mode 100644 (file)
index 0000000..84793b7
--- /dev/null
@@ -0,0 +1,59 @@
+{
+    "inputmetadata" :
+    {
+        "tensor_info" : [
+            {
+                "name" : "input_1:0",
+                "shape_type" : "NHWC",
+                "shape_dims" : [ 1, 320, 320, 3],
+                "data_type" : "FLOAT32",
+                "color_space" : "RGB888"
+            }
+        ],
+        "preprocess" : [
+            {
+                "normalization" : [
+                    {
+                        "mean" : [0.0, 0.0, 0.0],
+                        "std" : [255.0, 255.0, 255.0]
+                    }
+                ]
+            }
+        ]
+    },
+    "outputmetadata" :
+    {
+        "score" : [
+            {
+                "name" : "Identity:0",
+                "index" : [-1, -1, 1],
+                "top_number" : 5,
+                "threshold" : 0.4,
+                "score_type" : "NORMAL"
+            }
+        ],
+        "box" : [
+            {
+                "name" : "Identity:0",
+                "index" : [-1, -1, 1],
+                "box_type" : "ORIGIN_CENTER",
+                "box_order" : [0, 1, 2, 3],
+                "box_coordinate" : "RATIO",
+                "decoding_type" : "YOLO_ANCHOR",
+                "decoding_info" :
+                {
+                    "cell" :
+                    {
+                        "num_scales" : 3,
+                        "scales": [8, 16, 32]
+                    },
+                    "nms" :
+                    {
+                        "mode": "STANDARD",
+                        "iou_threshold": 0.2
+                    }
+                }
+            }
+        ]
+    }
+}
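diff --git a/mv_machine_learning/mv_inference/inference/include/BoxInfo.h b/mv_machine_learning/mv_inference/inference/include/BoxInfo.h
index 63bc28d21465343bf357786d64ace020b9cd9f89..d384ecd98531d81cd62f6ad74c3af9016c5da9af 100644 (file)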
@@ -84,6 +84,7 @@ namespace box
 
                        supportedBoxDecodingTypes.insert({"BYPASS", INFERENCE_BOX_DECODING_TYPE_BYPASS});
                        supportedBoxDecodingTypes.insert({"SSD_ANCHOR", INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR});
+                       supportedBoxDecodingTypes.insert({"YOLO_ANCHOR", INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR});
                }
 
                ~BoxInfo() = default;
@@ -237,20 +238,28 @@ namespace box
                                        return MEDIA_VISION_ERROR_INVALID_OPERATION;
                                }
 
+                               int ret = MEDIA_VISION_ERROR_NONE;
                                JsonObject *cObject = json_object_get_object_member(pObject, "decoding_info");
-                               if (!json_object_has_member(cObject, "anchor")) {
+                               if (json_object_has_member(cObject, "anchor")) {
+                                       ret = GetDecodeInfo().ParseAnchorParam(cObject);
+                                       if (ret != MEDIA_VISION_ERROR_NONE) {
+                                               LOGE("Fail to ParseAnchorParam[%d]", ret);
+                                               return ret;
+                                       }
+                               } else if (json_object_has_member(cObject, "cell")) {
+                                       ret = GetDecodeInfo().ParseCellParam(cObject);
+                                       if (ret != MEDIA_VISION_ERROR_NONE) {
+                                               LOGE("Fail to ParseCellParam[%d]", ret);
+                                               return ret;
+                                       }
+                               } else {
+
                                        LOGE("anchor is mandatory. Invalid metadata");
                                        LOGI("LEAVE");
 
                                        return MEDIA_VISION_ERROR_INVALID_OPERATION;
                                }
 
-                               int ret = GetDecodeInfo().ParseAnchorParam(cObject);
-                               if (ret != MEDIA_VISION_ERROR_NONE) {
-                                       LOGE("Fail to ParseAnchorParam[%d]", ret);
-                                       return ret;
-                               }
-
                                ret = GetDecodeInfo().ParseNms(cObject);
                                if (ret != MEDIA_VISION_ERROR_NONE) {
                                        LOGE("Fail to ParseNms[%d]", ret);
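diff --git a/mv_machine_learning/mv_inference/inference/include/DecodeInfo.h b/mv_machine_learning/mv_inference/inference/include/DecodeInfo.h
index 7cdbca4f8d820295bbd9bd4eba5fc5354efafa8f..5a478b3c2bf5d83b6cd9d6d08a911ce686464fb9 100644 (file)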
@@ -53,6 +53,11 @@ namespace box
                float hScale;
        };
 
+       struct CellParam {
+               int numScales;
+               std::vector<int> scales;
+       };
+
        struct NMSParam {
                inference_box_nms_type_e mode; /**< 0: standard */
                float iouThreshold;
@@ -83,6 +88,7 @@ namespace box
        private:
                AnchorParam anchorParam;
                std::vector<cv::Rect2f> anchorBoxes;
+               CellParam cellParam;
                NMSParam nmsParam;
                RotateParam rotParam;
                RoiOptionParam roiOptParam;
@@ -128,6 +134,11 @@ namespace box
                float GetAnchorHscale();
                float CalculateScale(float min, float max, int index, int maxStride);
 
+               // Cell param
+               int ParseCellParam(JsonObject *root);
+               std::vector<int>& GetCellScalesAll();
+               int GetCellNumScales();
+
                // Nms param
                int ParseNms(JsonObject *root);
                int GetNmsMode();
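diff --git a/mv_machine_learning/mv_inference/inference/include/OutputMetadataTypes.h b/mv_machine_learning/mv_inference/inference/include/OutputMetadataTypes.h
index 7ce558b4b6c06b2f6cfede14397d6d6d77734af1..10fe9200f3879a93e82e8831eb8c903b4acdc7ec 100644 (file)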
@@ -46,6 +46,7 @@ namespace inference
        typedef enum {
                INFERENCE_BOX_DECODING_TYPE_BYPASS,
                INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR,
+               INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR
        } inference_box_decoding_type_e;
 
        typedef enum {
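diff --git a/mv_machine_learning/mv_inference/inference/src/Inference.cpp b/mv_machine_learning/mv_inference/inference/src/Inference.cpp
index 8cb63c880e5287873cc5a5b947c9cd4185f6b445..b23b7338d89d4636cbf4b47036b691b383b20200 100755 (executable)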
@@ -1237,7 +1237,7 @@ namespace inference
                                        return MEDIA_VISION_ERROR_INVALID_OPERATION;
                                }
                                boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
-                       } else {
+                       } else if (outputMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
                                std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
                                if (boxIndexes.size() != 1) {
                                        LOGE("Invalid dim size. It should be 1");
@@ -1251,6 +1251,14 @@ namespace inference
                                        return MEDIA_VISION_ERROR_INVALID_OPERATION;
                                }
                                numberOfObjects = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]];
+                       } else { // INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR
+                               std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
+                               if (boxIndexes.size() != 1) {
+                                       LOGE("Invalid dim size. It should be 1");
+                                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
+                               }
+                               boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
+                               numberOfObjects = boxOffset - 5;
                        }
 
                        ObjectDecoder objDecoder(mOutputTensorBuffers, outputMeta, boxOffset,
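diff --git a/mv_machine_learning/mv_inference/inference/src/ObjectDecoder.cpp b/mv_machine_learning/mv_inference/inference/src/ObjectDecoder.cpp
index e631ff02799469a0835eaf587130f99294f835dc..3c5d8bb48c1bb4bd231bb3f3673a3374a72a10f1 100755 (executable)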
@@ -46,11 +46,13 @@ namespace inference
                        // Otherwise it is set already within ctor.
                        mNumberOfOjects = mTensorBuffer.getValue<int>(
                                                                mMeta.GetBoxNumberName(), indexes[0]);
-               } else {
+               } else if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
                        if (mMeta.GetBoxDecodeInfo().IsAnchorBoxEmpty()) {
                                LOGE("Anchor boxes are required but empty.");
                                return MEDIA_VISION_ERROR_INVALID_OPERATION;
                        }
+               } else {
+                       LOGI("YOLO_ANCHOR does nothing");
                }
 
                return MEDIA_VISION_ERROR_NONE;
@@ -143,10 +145,24 @@ namespace inference
 
        int ObjectDecoder::decode()
        {
+               LOGI("ENTER");
+
                BoxesList boxList;
+               Boxes boxes;
                int ret = MEDIA_VISION_ERROR_NONE;
+               int totalIdx = mNumberOfOjects;
+
+               if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR) {
+                       totalIdx = 0;
+                       for (auto& scale : mMeta.GetBoxDecodeInfo().GetCellScalesAll()) {
+                               totalIdx += (static_cast<int>(mScaleW) / scale
+                                                       * static_cast<int>(mScaleH) / scale)
+                                                       * mMeta.GetBoxDecodeInfo().GetCellNumScales();
+                       }
+                       boxList.reserve(mNumberOfOjects);
+               }
 
-               for (int idx = 0; idx < mNumberOfOjects; ++idx) {
+               for (int idx = 0; idx < totalIdx; ++idx) {
                        if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_BYPASS) {
                                float score = decodeScore(idx);
                                if (score <= 0.0f)
@@ -154,10 +170,10 @@ namespace inference
 
                                Box box = decodeBox(idx, score);
                                mResultBoxes.push_back(box);
-                       } else {
+                       } else if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
                                int anchorIdx = -1;
 
-                               Boxes boxes;
+                               boxes.clear();
                                for (auto& anchorBox : mMeta.GetBoxDecodeInfo().GetAnchorBoxAll()) {
                                        anchorIdx++;
 
@@ -170,9 +186,35 @@ namespace inference
                                        boxes.push_back(box);
                                }
                                boxList.push_back(boxes);
+                       } else { // INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR
+                               int cellIdx = idx * mBoxOffset;
+                               float score = decodeScore(cellIdx + 4);
+                               if (score <= 0.0f) {
+                                       continue;
+                               }
+                               // need to check the score
+                               float objScore = 0.0f;
+                               int objIdx = 0;
+                               for (int objIdx_ = 0; objIdx_ < mNumberOfOjects; ++objIdx_) {
+                                       float objScore_ = decodeScore(cellIdx + 5 + objIdx_);
+                                       if (objScore_ > objScore) {
+                                               objScore = objScore_;
+                                               objIdx = objIdx_;
+                                       }
+                               }
+
+                               if (objScore <  mMeta.GetScoreThreshold())
+                                       continue;
+
+                               Box box = decodeBox(idx, objScore, objIdx);
+                               boxes.push_back(box);
                        }
                }
 
+               if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR ||
+                       mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR)
+                       boxList.push_back(boxes);
+
                if (!boxList.empty()) {
                        PostProcess postProc;
                        ret = postProc.Nms(boxList,
@@ -183,8 +225,12 @@ namespace inference
                                LOGE("Fail to non-maximum suppression[%d]", ret);
                                return ret;
                        }
+               } else {
+                       LOGW("boxlist empty!");
                }
 
+               LOGI("LEAVE");
+
                return ret;
        }
 
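diff --git a/mv_machine_learning/mv_inference/inference/src/OutputMetadata.cpp b/mv_machine_learning/mv_inference/inference/src/OutputMetadata.cpp
index 391b265cb250c3b0fe4cc4d61307bd3f3fa0fb95..24cd8b2ca85e40d829707f239b8137af8f92f079 100755 (executable)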
@@ -129,6 +129,32 @@ namespace inference
                return MEDIA_VISION_ERROR_NONE;
        }
 
+       int DecodeInfo::ParseCellParam(JsonObject *root)
+       {
+               JsonObject *object = json_object_get_object_member(root, "cell") ;
+
+               this->cellParam.numScales = static_cast<int>(json_object_get_int_member(object, "num_scales"));
+
+               JsonArray * array = json_object_get_array_member(object, "scales");
+               unsigned int elements2 = json_array_get_length(array);
+               for (unsigned int elem2 = 0; elem2 < elements2; ++elem2) {
+                       auto scale = static_cast<int>(json_array_get_int_element(array, elem2));
+                       this->cellParam.scales.push_back(scale);
+                       LOGI("scale: %d", scale);
+               }
+               return MEDIA_VISION_ERROR_NONE;
+       }
+
+       std::vector<int>& DecodeInfo::GetCellScalesAll()
+       {
+               return this->cellParam.scales;
+       }
+
+       int DecodeInfo::GetCellNumScales()
+       {
+               return this->cellParam.numScales;
+       }
+
        float DecodeInfo::CalculateScale(float min, float max, int index, int maxStride)
        {
                return min + (max - min) * 1.0 * index / (maxStride - 1.0f);
@@ -443,21 +469,20 @@ namespace inference
                                        return ret;
                                }
 
-                       } else if (box.GetDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
+                       } else {
                                ret = box.ParseDecodeInfo(root);
                                if (ret != MEDIA_VISION_ERROR_NONE) {
                                        LOGE("Fail to GetBoxDecodeInfo[%d]", ret);
                                        return ret;
                                }
 
-                               ret = box.GetDecodeInfo().GenerateAnchor();
-                               if (ret != MEDIA_VISION_ERROR_NONE) {
-                                       LOGE("Fail to GenerateAnchor[%d]", ret);
-                                       return ret;
+                               if (box.GetDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
+                                       ret = box.GetDecodeInfo().GenerateAnchor();
+                                       if (ret != MEDIA_VISION_ERROR_NONE) {
+                                               LOGE("Fail to GenerateAnchor[%d]", ret);
+                                               return ret;
+                                       }
                                }
-
-                       } else {
-                               LOGW("Unknow box decoding type. Ignore");
                        }
                }
 
diff --git a/packaging/capi-media-vision.spec b/packaging/capi-media-vision.spec
index 7bb69887994714eabce048703efac0d3b7417cde..0cc1d124f34be171b9d98731dfa036b68f5b1817 100644 (file)
@@ -1,6 +1,6 @@
 Name:        capi-media-vision
 Summary:     Media Vision library for Tizen Native API
-Version:     0.12.3
+Version:     0.13.0
 Release:     0
 Group:       Multimedia/Framework
 License:     Apache-2.0 and BSD-3-Clause
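diff --git a/test/testsuites/machine_learning/inference/inference_test_suite.c b/test/testsuites/machine_learning/inference/inference_test_suite.c
index 681afab99c8a0addb0576235e1fc4536137b2a1c..c5b13edbb40d18ec1e68c7f2d99d3f9fbb5dd60e 100644 (file)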
        "/usr/share/capi-media-vision/models/OD/tflite/od_efficientdet.tflite"
 #define OD_TFLITE_META_QUANT_EFFICIENT_PATH \
        "/usr/share/capi-media-vision/models/OD/tflite/od_efficientdet.json"
+#define OD_TFLITE_WEIGHT_YOLO_V5_320_PATH \
+       "/usr/share/capi-media-vision/models/OD/tflite/od_yolo_v5_320x320.tflite"
+#define OD_TFLITE_META_YOLO_V5_320_PATH \
+       "/usr/share/capi-media-vision/models/OD/tflite/od_yolo_v5_320x320.json"
+#define OD_LABLE_YOLO_V5_320_PATH \
+       "/usr/share/capi-media-vision/models/OD/tflite/od_yolo_v5_label.txt"
 
 //Face Detection
 #define FD_TFLITE_WEIGHT_PATH \
@@ -1271,6 +1277,7 @@ int perform_object_detection()
                "Hosted[o]: TFLite(cpu + MobilenetV1+SSD)",
                "Hosted[o]: TFLite(cpu + MobilenetV2+SSD)",
                "Hosted[o]: TFLite(Quant + EfficientDet)",
+               "Hosted[o]: TFLite(cpu + YoloV5)"
        };
 
        int sel_opt = show_menu_linear("Select Action:", names, ARRAY_SIZE(names));
@@ -1309,6 +1316,12 @@ int perform_object_detection()
                                OD_LABEL_QUANT_EFFICIENT_PATH,
                                OD_TFLITE_META_QUANT_EFFICIENT_PATH);
        } break;
+       case 7: {
+               err = engine_config_user_hosted_tflite_cpu(
+                               engine_cfg, OD_TFLITE_WEIGHT_YOLO_V5_320_PATH,
+                               OD_LABLE_YOLO_V5_320_PATH,
+                               OD_TFLITE_META_YOLO_V5_320_PATH);
+       } break;
        }
        if (err != MEDIA_VISION_ERROR_NONE) {
                printf("Fail to perform config [err:%i]\n", err);