mv_machine_learning: support YOLO-V5 inference

author Tae-Young Chung <ty83.chung@samsung.com>

Tue, 25 Jan 2022 07:18:19 +0000 (16:18 +0900)

committer Tae-Young Chung <ty83.chung@samsung.com>

Tue, 25 Jan 2022 07:18:48 +0000 (16:18 +0900)
author Tae-Young Chung <ty83.chung@samsung.com>
Tue, 25 Jan 2022 07:18:19 +0000 (16:18 +0900)
committer Tae-Young Chung <ty83.chung@samsung.com>
Tue, 25 Jan 2022 07:18:48 +0000 (16:18 +0900)
diff --git a/meta-template/od_yolo_v5_320x320.json b/meta-template/od_yolo_v5_320x320.json

new file mode 100644 (file)

index 0000000..84793b7
--- /dev/null
+++ b/meta-template/od_yolo_v5_320x320.json
@@ -0,0 +1,59 @@
+{
+    "inputmetadata" :
+    {
+        "tensor_info" : [
+            {
+                "name" : "input_1:0",
+                "shape_type" : "NHWC",
+                "shape_dims" : [ 1, 320, 320, 3],
+                "data_type" : "FLOAT32",
+                "color_space" : "RGB888"
+            }
+        ],
+        "preprocess" : [
+            {
+                "normalization" : [
+                    {
+                        "mean" : [0.0, 0.0, 0.0],
+                        "std" : [255.0, 255.0, 255.0]
+                    }
+                ]
+            }
+        ]
+    },
+    "outputmetadata" :
+    {
+        "score" : [
+            {
+                "name" : "Identity:0",
+                "index" : [-1, -1, 1],
+                "top_number" : 5,
+                "threshold" : 0.4,
+                "score_type" : "NORMAL"
+            }
+        ],
+        "box" : [
+            {
+                "name" : "Identity:0",
+                "index" : [-1, -1, 1],
+                "box_type" : "ORIGIN_CENTER",
+                "box_order" : [0, 1, 2, 3],
+                "box_coordinate" : "RATIO",
+                "decoding_type" : "YOLO_ANCHOR",
+                "decoding_info" :
+                {
+                    "cell" :
+                    {
+                        "num_scales" : 3,
+                        "scales": [8, 16, 32]
+                    },
+                    "nms" :
+                    {
+                        "mode": "STANDARD",
+                        "iou_threshold": 0.2
+                    }
+                }
+            }
+        ]
+    }
+}
diff --git a/mv_machine_learning/mv_inference/inference/include/BoxInfo.h b/mv_machine_learning/mv_inference/inference/include/BoxInfo.h

index 63bc28d21465343bf357786d64ace020b9cd9f89..d384ecd98531d81cd62f6ad74c3af9016c5da9af 100644 (file)
--- a/mv_machine_learning/mv_inference/inference/include/BoxInfo.h
+++ b/mv_machine_learning/mv_inference/inference/include/BoxInfo.h
@@ -84,6 +84,7 @@ namespace box
  
                         supportedBoxDecodingTypes.insert({"BYPASS", INFERENCE_BOX_DECODING_TYPE_BYPASS});
                         supportedBoxDecodingTypes.insert({"SSD_ANCHOR", INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR});
+                       supportedBoxDecodingTypes.insert({"YOLO_ANCHOR", INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR});
                 }
  
                 ~BoxInfo() = default;
@@ -237,20 +238,28 @@ namespace box
                                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
                                 }
  
+                               int ret = MEDIA_VISION_ERROR_NONE;
                                 JsonObject *cObject = json_object_get_object_member(pObject, "decoding_info");
-                               if (!json_object_has_member(cObject, "anchor")) {
+                               if (json_object_has_member(cObject, "anchor")) {
+                                       ret = GetDecodeInfo().ParseAnchorParam(cObject);
+                                       if (ret != MEDIA_VISION_ERROR_NONE) {
+                                               LOGE("Fail to ParseAnchorParam[%d]", ret);
+                                               return ret;
+                                       }
+                               } else if (json_object_has_member(cObject, "cell")) {
+                                       ret = GetDecodeInfo().ParseCellParam(cObject);
+                                       if (ret != MEDIA_VISION_ERROR_NONE) {
+                                               LOGE("Fail to ParseCellParam[%d]", ret);
+                                               return ret;
+                                       }
+                               } else {
+
                                         LOGE("anchor is mandatory. Invalid metadata");
                                         LOGI("LEAVE");
  
                                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
                                 }
  
-                               int ret = GetDecodeInfo().ParseAnchorParam(cObject);
-                               if (ret != MEDIA_VISION_ERROR_NONE) {
-                                       LOGE("Fail to ParseAnchorParam[%d]", ret);
-                                       return ret;
-                               }
-
                                 ret = GetDecodeInfo().ParseNms(cObject);
                                 if (ret != MEDIA_VISION_ERROR_NONE) {
                                         LOGE("Fail to ParseNms[%d]", ret);
diff --git a/mv_machine_learning/mv_inference/inference/include/DecodeInfo.h b/mv_machine_learning/mv_inference/inference/include/DecodeInfo.h

index 7cdbca4f8d820295bbd9bd4eba5fc5354efafa8f..5a478b3c2bf5d83b6cd9d6d08a911ce686464fb9 100644 (file)
--- a/mv_machine_learning/mv_inference/inference/include/DecodeInfo.h
+++ b/mv_machine_learning/mv_inference/inference/include/DecodeInfo.h
@@ -53,6 +53,11 @@ namespace box
                 float hScale;
         };
  
+       /**
+        * Cell description read from the "cell" member of a box "decoding_info"
+        * object by DecodeInfo::ParseCellParam().
+        */
+       struct CellParam {
+               int numScales = 0; /**< value of "num_scales"; 0 until parsed */
+               std::vector<int> scales; /**< entries of the "scales" array, in file order */
+       };
+
         struct NMSParam {
                 inference_box_nms_type_e mode; /**< 0: standard */
                 float iouThreshold;
@@ -83,6 +88,7 @@ namespace box
         private:
                 AnchorParam anchorParam;
                 std::vector<cv::Rect2f> anchorBoxes;
+               CellParam cellParam;
                 NMSParam nmsParam;
                 RotateParam rotParam;
                 RoiOptionParam roiOptParam;
@@ -128,6 +134,11 @@ namespace box
                 float GetAnchorHscale();
                 float CalculateScale(float min, float max, int index, int maxStride);
  
+               // Cell param
+               int ParseCellParam(JsonObject *root);
+               std::vector<int>& GetCellScalesAll();
+               int GetCellNumScales();
+
                 // Nms param
                 int ParseNms(JsonObject *root);
                 int GetNmsMode();
diff --git a/mv_machine_learning/mv_inference/inference/include/OutputMetadataTypes.h b/mv_machine_learning/mv_inference/inference/include/OutputMetadataTypes.h

index 7ce558b4b6c06b2f6cfede14397d6d6d77734af1..10fe9200f3879a93e82e8831eb8c903b4acdc7ec 100644 (file)
--- a/mv_machine_learning/mv_inference/inference/include/OutputMetadataTypes.h
+++ b/mv_machine_learning/mv_inference/inference/include/OutputMetadataTypes.h
@@ -46,6 +46,7 @@ namespace inference
         typedef enum {
                 INFERENCE_BOX_DECODING_TYPE_BYPASS,
                 INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR,
+               INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR
         } inference_box_decoding_type_e;
  
         typedef enum {
diff --git a/mv_machine_learning/mv_inference/inference/src/Inference.cpp b/mv_machine_learning/mv_inference/inference/src/Inference.cpp

index 8cb63c880e5287873cc5a5b947c9cd4185f6b445..b23b7338d89d4636cbf4b47036b691b383b20200 100755 (executable)
--- a/mv_machine_learning/mv_inference/inference/src/Inference.cpp
+++ b/mv_machine_learning/mv_inference/inference/src/Inference.cpp
@@ -1237,7 +1237,7 @@ namespace inference
                                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
                                 }
                                 boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
-                       } else {
+                       } else if (outputMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
                                 std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
                                 if (boxIndexes.size() != 1) {
                                         LOGE("Invalid dim size. It should be 1");
@@ -1251,6 +1251,14 @@ namespace inference
                                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
                                 }
                                 numberOfObjects = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]];
+                       } else { // INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR
+                               std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
+                               if (boxIndexes.size() != 1) {
+                                       LOGE("Invalid dim size. It should be 1");
+                                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
+                               }
+                               boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
+                               numberOfObjects = boxOffset - 5;
                         }
  
                         ObjectDecoder objDecoder(mOutputTensorBuffers, outputMeta, boxOffset,
diff --git a/mv_machine_learning/mv_inference/inference/src/ObjectDecoder.cpp b/mv_machine_learning/mv_inference/inference/src/ObjectDecoder.cpp

index e631ff02799469a0835eaf587130f99294f835dc..3c5d8bb48c1bb4bd231bb3f3673a3374a72a10f1 100755 (executable)
--- a/mv_machine_learning/mv_inference/inference/src/ObjectDecoder.cpp
+++ b/mv_machine_learning/mv_inference/inference/src/ObjectDecoder.cpp
@@ -46,11 +46,13 @@ namespace inference
                         // Otherwise it is set already within ctor.
                         mNumberOfOjects = mTensorBuffer.getValue<int>(
                                                                 mMeta.GetBoxNumberName(), indexes[0]);
-               } else {
+               } else if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
                         if (mMeta.GetBoxDecodeInfo().IsAnchorBoxEmpty()) {
                                 LOGE("Anchor boxes are required but empty.");
                                 return MEDIA_VISION_ERROR_INVALID_OPERATION;
                         }
+               } else {
+                       LOGI("YOLO_ANCHOR does nothing");
                 }
  
                 return MEDIA_VISION_ERROR_NONE;
@@ -143,10 +145,24 @@ namespace inference
  
         int ObjectDecoder::decode()
         {
+               LOGI("ENTER");
+
                 BoxesList boxList;
+               Boxes boxes;
                 int ret = MEDIA_VISION_ERROR_NONE;
+               int totalIdx = mNumberOfOjects;
+
+               if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR) {
+                       totalIdx = 0;
+                       for (auto& scale : mMeta.GetBoxDecodeInfo().GetCellScalesAll()) {
+                               totalIdx += (static_cast<int>(mScaleW) / scale
+                                                       * static_cast<int>(mScaleH) / scale)
+                                                       * mMeta.GetBoxDecodeInfo().GetCellNumScales();
+                       }
+                       boxList.reserve(mNumberOfOjects);
+               }
  
-               for (int idx = 0; idx < mNumberOfOjects; ++idx) {
+               for (int idx = 0; idx < totalIdx; ++idx) {
                         if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_BYPASS) {
                                 float score = decodeScore(idx);
                                 if (score <= 0.0f)
@@ -154,10 +170,10 @@ namespace inference
  
                                 Box box = decodeBox(idx, score);
                                 mResultBoxes.push_back(box);
-                       } else {
+                       } else if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
                                 int anchorIdx = -1;
  
-                               Boxes boxes;
+                               boxes.clear();
                                 for (auto& anchorBox : mMeta.GetBoxDecodeInfo().GetAnchorBoxAll()) {
                                         anchorIdx++;
  
@@ -170,9 +186,35 @@ namespace inference
                                         boxes.push_back(box);
                                 }
                                 boxList.push_back(boxes);
+                       } else { // INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR
+                               int cellIdx = idx * mBoxOffset;
+                               float score = decodeScore(cellIdx + 4);
+                               if (score <= 0.0f) {
+                                       continue;
+                               }
+                               // need to check the score
+                               float objScore = 0.0f;
+                               int objIdx = 0;
+                               for (int objIdx_ = 0; objIdx_ < mNumberOfOjects; ++objIdx_) {
+                                       float objScore_ = decodeScore(cellIdx + 5 + objIdx_);
+                                       if (objScore_ > objScore) {
+                                               objScore = objScore_;
+                                               objIdx = objIdx_;
+                                       }
+                               }
+
+                               if (objScore <  mMeta.GetScoreThreshold())
+                                       continue;
+
+                               Box box = decodeBox(idx, objScore, objIdx);
+                               boxes.push_back(box);
                         }
                 }
  
+               if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR ||
+                       mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR)
+                       boxList.push_back(boxes);
+
                 if (!boxList.empty()) {
                         PostProcess postProc;
                         ret = postProc.Nms(boxList,
@@ -183,8 +225,12 @@ namespace inference
                                 LOGE("Fail to non-maximum suppression[%d]", ret);
                                 return ret;
                         }
+               } else {
+                       LOGW("boxlist empty!");
                 }
  
+               LOGI("LEAVE");
+
                 return ret;
         }
  
diff --git a/mv_machine_learning/mv_inference/inference/src/OutputMetadata.cpp b/mv_machine_learning/mv_inference/inference/src/OutputMetadata.cpp

index 391b265cb250c3b0fe4cc4d61307bd3f3fa0fb95..24cd8b2ca85e40d829707f239b8137af8f92f079 100755 (executable)
--- a/mv_machine_learning/mv_inference/inference/src/OutputMetadata.cpp
+++ b/mv_machine_learning/mv_inference/inference/src/OutputMetadata.cpp
@@ -129,6 +129,32 @@ namespace inference
                 return MEDIA_VISION_ERROR_NONE;
         }
  
+       /**
+        * Parses the "cell" member of a box "decoding_info" object into cellParam.
+        *
+        * Reads "num_scales" and every entry of the "scales" array. Scales kept
+        * from an earlier call are discarded first, so parsing twice does not
+        * accumulate entries.
+        *
+        * @param root JSON object that holds the "cell" member.
+        * @return MEDIA_VISION_ERROR_NONE on success, or
+        *         MEDIA_VISION_ERROR_INVALID_OPERATION when a required member is
+        *         missing or a scale is not positive.
+        */
+       int DecodeInfo::ParseCellParam(JsonObject *root)
+       {
+               if (!json_object_has_member(root, "cell")) {
+                       LOGE("cell is mandatory. Invalid metadata");
+                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
+               }
+
+               JsonObject *object = json_object_get_object_member(root, "cell");
+               if (!json_object_has_member(object, "num_scales") ||
+                       !json_object_has_member(object, "scales")) {
+                       LOGE("num_scales and scales are mandatory. Invalid metadata");
+                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
+               }
+
+               this->cellParam.numScales = static_cast<int>(json_object_get_int_member(object, "num_scales"));
+
+               // Start from an empty list so that a repeated parse replaces the scales
+               this->cellParam.scales.clear();
+
+               JsonArray *array = json_object_get_array_member(object, "scales");
+               unsigned int elements2 = json_array_get_length(array);
+               for (unsigned int elem2 = 0; elem2 < elements2; ++elem2) {
+                       auto scale = static_cast<int>(json_array_get_int_element(array, elem2));
+                       // The object decoder divides the input size by each scale,
+                       // so zero or negative values must be rejected here.
+                       if (scale <= 0) {
+                               LOGE("Invalid scale[%d]. It should be positive", scale);
+                               return MEDIA_VISION_ERROR_INVALID_OPERATION;
+                       }
+                       this->cellParam.scales.push_back(scale);
+                       LOGI("scale: %d", scale);
+               }
+               return MEDIA_VISION_ERROR_NONE;
+       }
+
+       // Gives access to the scale list held in cellParam (filled by ParseCellParam()).
+       // The reference is mutable and refers to the member itself, not a copy.
+       std::vector<int>& DecodeInfo::GetCellScalesAll()
+       {
+               return cellParam.scales;
+       }
+
+       // Reports the "num_scales" value held in cellParam (set by ParseCellParam()).
+       int DecodeInfo::GetCellNumScales()
+       {
+               return cellParam.numScales;
+       }
+
         float DecodeInfo::CalculateScale(float min, float max, int index, int maxStride)
         {
                 return min + (max - min) * 1.0 * index / (maxStride - 1.0f);
@@ -443,21 +469,20 @@ namespace inference
                                         return ret;
                                 }
  
-                       } else if (box.GetDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
+                       } else {
                                 ret = box.ParseDecodeInfo(root);
                                 if (ret != MEDIA_VISION_ERROR_NONE) {
                                         LOGE("Fail to GetBoxDecodeInfo[%d]", ret);
                                         return ret;
                                 }
  
-                               ret = box.GetDecodeInfo().GenerateAnchor();
-                               if (ret != MEDIA_VISION_ERROR_NONE) {
-                                       LOGE("Fail to GenerateAnchor[%d]", ret);
-                                       return ret;
+                               if (box.GetDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
+                                       ret = box.GetDecodeInfo().GenerateAnchor();
+                                       if (ret != MEDIA_VISION_ERROR_NONE) {
+                                               LOGE("Fail to GenerateAnchor[%d]", ret);
+                                               return ret;
+                                       }
                                 }
-
-                       } else {
-                               LOGW("Unknow box decoding type. Ignore");
                         }
                 }
  
diff --git a/packaging/capi-media-vision.spec b/packaging/capi-media-vision.spec

index 7bb69887994714eabce048703efac0d3b7417cde..0cc1d124f34be171b9d98731dfa036b68f5b1817 100644 (file)
--- a/packaging/capi-media-vision.spec
+++ b/packaging/capi-media-vision.spec
@@ -1,6 +1,6 @@
  Name:        capi-media-vision
  Summary:     Media Vision library for Tizen Native API
-Version:     0.12.3
+Version:     0.13.0
  Release:     0
  Group:       Multimedia/Framework
  License:     Apache-2.0 and BSD-3-Clause
diff --git a/test/testsuites/machine_learning/inference/inference_test_suite.c b/test/testsuites/machine_learning/inference/inference_test_suite.c

index 681afab99c8a0addb0576235e1fc4536137b2a1c..c5b13edbb40d18ec1e68c7f2d99d3f9fbb5dd60e 100644 (file)
--- a/test/testsuites/machine_learning/inference/inference_test_suite.c
+++ b/test/testsuites/machine_learning/inference/inference_test_suite.c
@@ -148,6 +148,12 @@
         "/usr/share/capi-media-vision/models/OD/tflite/od_efficientdet.tflite"
  #define OD_TFLITE_META_QUANT_EFFICIENT_PATH \
         "/usr/share/capi-media-vision/models/OD/tflite/od_efficientdet.json"
+#define OD_TFLITE_WEIGHT_YOLO_V5_320_PATH \
+       "/usr/share/capi-media-vision/models/OD/tflite/od_yolo_v5_320x320.tflite"
+#define OD_TFLITE_META_YOLO_V5_320_PATH \
+       "/usr/share/capi-media-vision/models/OD/tflite/od_yolo_v5_320x320.json"
+#define OD_LABLE_YOLO_V5_320_PATH \
+       "/usr/share/capi-media-vision/models/OD/tflite/od_yolo_v5_label.txt"
  
  //Face Detection
  #define FD_TFLITE_WEIGHT_PATH \
@@ -1271,6 +1277,7 @@ int perform_object_detection()
                 "Hosted[o]: TFLite(cpu + MobilenetV1+SSD)",
                 "Hosted[o]: TFLite(cpu + MobilenetV2+SSD)",
                 "Hosted[o]: TFLite(Quant + EfficientDet)",
+               "Hosted[o]: TFLite(cpu + YoloV5)"
         };
  
         int sel_opt = show_menu_linear("Select Action:", names, ARRAY_SIZE(names));
@@ -1309,6 +1316,12 @@ int perform_object_detection()
                                 OD_LABEL_QUANT_EFFICIENT_PATH,
                                 OD_TFLITE_META_QUANT_EFFICIENT_PATH);
         } break;
+       case 7: {
+               err = engine_config_user_hosted_tflite_cpu(
+                               engine_cfg, OD_TFLITE_WEIGHT_YOLO_V5_320_PATH,
+                               OD_LABLE_YOLO_V5_320_PATH,
+                               OD_TFLITE_META_YOLO_V5_320_PATH);
+       } break;
         }
         if (err != MEDIA_VISION_ERROR_NONE) {
                 printf("Fail to perform config [err:%i]\n", err);
author	Tae-Young Chung <ty83.chung@samsung.com>
	Tue, 25 Jan 2022 07:18:19 +0000 (16:18 +0900)
committer	Tae-Young Chung <ty83.chung@samsung.com>
	Tue, 25 Jan 2022 07:18:48 +0000 (16:18 +0900)
meta-template/od_yolo_v5_320x320.json	[new file with mode: 0644]	patch \| blob
mv_machine_learning/mv_inference/inference/include/BoxInfo.h		patch \| blob \| history
mv_machine_learning/mv_inference/inference/include/DecodeInfo.h		patch \| blob \| history
mv_machine_learning/mv_inference/inference/include/OutputMetadataTypes.h		patch \| blob \| history
mv_machine_learning/mv_inference/inference/src/Inference.cpp		patch \| blob \| history
mv_machine_learning/mv_inference/inference/src/ObjectDecoder.cpp		patch \| blob \| history
mv_machine_learning/mv_inference/inference/src/OutputMetadata.cpp		patch \| blob \| history
packaging/capi-media-vision.spec		patch \| blob \| history
test/testsuites/machine_learning/inference/inference_test_suite.c		patch \| blob \| history