Add decode yolo

author Kwanghoon Son <k.son@samsung.com>

Thu, 22 Sep 2022 02:05:36 +0000 (22:05 -0400)

committer Tae-Young Chung <ty83.chung@samsung.com>

Wed, 9 Nov 2022 02:41:02 +0000 (02:41 +0000)
author Kwanghoon Son <k.son@samsung.com>
Thu, 22 Sep 2022 02:05:36 +0000 (22:05 -0400)
committer Tae-Young Chung <ty83.chung@samsung.com>
Wed, 9 Nov 2022 02:41:02 +0000 (02:41 +0000)
diff --git a/mv_machine_learning/inference/include/DecodeInfo.h b/mv_machine_learning/inference/include/DecodeInfo.h

index 6a38e54..91f45e1 100644 (file)
--- a/mv_machine_learning/inference/include/DecodeInfo.h
+++ b/mv_machine_learning/inference/include/DecodeInfo.h
@@ -60,6 +60,7 @@ struct AnchorParam
         std::map<std::string, inference_score_type_e> supportedCellType;
         std::vector<std::vector<double> > vxScales;
         std::vector<std::vector<double> > vyScales;
+       int totalAnchors;
  };
  
  struct NMSParam
@@ -95,6 +96,7 @@ struct DecodeInfo
  {
         AnchorParam anchorParam;
         std::vector<cv::Rect2f> anchorBoxes;
+       std::vector<std::vector<cv::Rect2f> > vAnchorBoxes; // (stride) * (H * W * B) * (rect)
         NMSParam nmsParam;
         RotateParam rotParam;
         RoiOptionParam roiOptParam;
@@ -132,6 +134,7 @@ struct DecodeInfo
         // Anchor param
         int ParseAnchorParam(JsonObject *root);
         int GenerateAnchor();
+       int GenerateYOLOAnchor();
         bool IsFixedAnchorSize();
         bool IsExponentialBoxScale();
         float GetAnchorXscale();
diff --git a/mv_machine_learning/inference/include/ObjectDecoder.h b/mv_machine_learning/inference/include/ObjectDecoder.h

index 50817d7..84daa41 100644 (file)
--- a/mv_machine_learning/inference/include/ObjectDecoder.h
+++ b/mv_machine_learning/inference/include/ObjectDecoder.h
@@ -50,7 +50,10 @@ private:
  
         float decodeScore(int idx);
         Box decodeBox(int idx, float score, int label = -1, int offset = 0);
+       Box decodeYOLOBox(int idx, float score, int label, int offset, int nameIdx);
         Box decodeBoxWithAnchor(int idx, int anchorIdx, float score, cv::Rect2f &anchor);
+       void decodeYOLO(BoxesList &boxList);
+       float decodeYOLOScore(int idx, int nameIdx);
  
  public:
         ObjectDecoder(TensorBuffer &buffer, OutputMetadata &metaData, int boxOffset, float scaleW, float scaleH,
diff --git a/mv_machine_learning/inference/src/DecodeInfo.cpp b/mv_machine_learning/inference/src/DecodeInfo.cpp

index f33f062..846a98e 100644 (file)
--- a/mv_machine_learning/inference/src/DecodeInfo.cpp
+++ b/mv_machine_learning/inference/src/DecodeInfo.cpp
@@ -160,6 +160,71 @@ float DecodeInfo::GetAnchorHscale()
         return anchorParam.hScale;
  }
  
+/**
+ * @brief Generates the YOLO anchor boxes for every configured stride.
+ *
+ * For stride index s the grid has (inputSizeHeight / stride) x (inputSizeWidth / stride) cells.
+ * Boxes are appended to vAnchorBoxes[s] in (y, x, anchor) order, three per cell:
+ *  - origin: (x + anchorOffsetX, y + anchorOffsetY), x and y being the cell indices
+ *  - size:   (vxScales[s][a] * stride, vyScales[s][a] * stride)
+ * anchorParam.totalAnchors receives the number of boxes summed over all strides.
+ *
+ * Every parameter is validated before anything is modified, so a failed call leaves
+ * vAnchorBoxes and totalAnchors untouched. An empty stride list is not an error.
+ *
+ * @return MEDIA_VISION_ERROR_NONE on success, MEDIA_VISION_ERROR_INVALID_OPERATION when a
+ *         stride is not positive or a stride has fewer than three x or y scales.
+ * @ref https://wikidocs.net/163607
+ */
+int DecodeInfo::GenerateYOLOAnchor()
+{
+       LOGI("ENTER");
+
+       // NOTE(review): ObjectDecoder::decodeYOLO() walks offsetAnchors boxes per cell, so this
+       // constant has to equal that value - confirm against the model metadata.
+       constexpr size_t kAnchorsPerCell = 3;
+       const size_t strideCount = anchorParam.strides.size();
+
+       if (anchorParam.vxScales.size() < strideCount || anchorParam.vyScales.size() < strideCount) {
+               LOGE("Anchor scales are missing for some strides");
+               return MEDIA_VISION_ERROR_INVALID_OPERATION;
+       }
+       for (size_t strideIdx = 0; strideIdx < strideCount; ++strideIdx) {
+               if (anchorParam.strides[strideIdx] <= 0 || anchorParam.vxScales[strideIdx].size() < kAnchorsPerCell ||
+                       anchorParam.vyScales[strideIdx].size() < kAnchorsPerCell) {
+                       LOGE("Invalid stride or anchor scales at stride index[%zu]", strideIdx);
+                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
+               }
+       }
+
+       anchorParam.totalAnchors = 0;
+       // assign() rather than resize(): boxes left over from an earlier call must not pile up
+       vAnchorBoxes.assign(strideCount, std::vector<cv::Rect2f>());
+       for (size_t strideIdx = 0; strideIdx < strideCount; ++strideIdx) {
+               const auto stride = anchorParam.strides[strideIdx];
+               const auto gridHeight = anchorParam.inputSizeHeight / stride;
+               const auto gridWidth = anchorParam.inputSizeWidth / stride;
+               const auto &xScales = anchorParam.vxScales[strideIdx];
+               const auto &yScales = anchorParam.vyScales[strideIdx];
+               auto &boxes = vAnchorBoxes[strideIdx];
+
+               for (int y = 0; y < gridHeight; ++y) {
+                       for (int x = 0; x < gridWidth; ++x) {
+                               for (size_t a = 0; a < kAnchorsPerCell; ++a) {
+                                       boxes.emplace_back(
+                                                       cv::Point2f(static_cast<float>(x) + anchorParam.anchorOffsetX,
+                                                                               static_cast<float>(y) + anchorParam.anchorOffsetY),
+                                                       cv::Size2f(static_cast<float>(xScales[a] * static_cast<float>(stride)),
+                                                                          static_cast<float>(yScales[a] * static_cast<float>(stride))));
+                               }
+                       }
+               }
+               anchorParam.totalAnchors += static_cast<int>(boxes.size());
+       }
+
+       LOGI("LEAVE");
+       return MEDIA_VISION_ERROR_NONE;
+}
  int DecodeInfo::GenerateAnchor()
  {
         if (anchorParam.strides.empty() || anchorParam.aspectRatios.empty()) {
diff --git a/mv_machine_learning/inference/src/Inference.cpp b/mv_machine_learning/inference/src/Inference.cpp

index 18f6544..d5d3c68 100644 (file)
--- a/mv_machine_learning/inference/src/Inference.cpp
+++ b/mv_machine_learning/inference/src/Inference.cpp
@@ -352,11 +352,15 @@ void Inference::ConfigureOutputInfo(const std::vector<std::string> names,
         OutputMetadata &outputMeta = mMetadata.GetOutputMeta();
         if (mMetadata.GetOutputMeta().IsParsed()) {
                 mConfig.mOutputLayerNames.clear();
-               if (!outputMeta.GetScoreName().empty())
-                       mConfig.mOutputLayerNames.push_back(outputMeta.GetScoreName());
+               if (!outputMeta.GetScoreName().empty()) {
+                       for (auto &name : outputMeta.score.names)
+                               mConfig.mOutputLayerNames.push_back(name);
+               }
  
-               if (!outputMeta.GetBoxName().empty())
-                       mConfig.mOutputLayerNames.push_back(outputMeta.GetBoxName());
+               if (!outputMeta.GetBoxName().empty()) {
+                       for (auto &name : outputMeta.box.names)
+                               mConfig.mOutputLayerNames.push_back(name);
+               }
  
                 if (!outputMeta.GetBoxLabelName().empty())
                         mConfig.mOutputLayerNames.push_back(outputMeta.GetBoxLabelName());
diff --git a/mv_machine_learning/inference/src/ObjectDecoder.cpp b/mv_machine_learning/inference/src/ObjectDecoder.cpp

index 82939f2..cf5f283 100644 (file)
--- a/mv_machine_learning/inference/src/ObjectDecoder.cpp
+++ b/mv_machine_learning/inference/src/ObjectDecoder.cpp
@@ -20,6 +20,8 @@
  #include <unistd.h>
  #include <fstream>
  #include <string>
+#include <iostream>
+using namespace std;
  
  namespace mediavision
  {
@@ -132,6 +134,116 @@ Box ObjectDecoder::decodeBoxWithAnchor(int idx, int anchorIdx, float score, cv::
         return box;
  }
  
+// Reads element idx of the score tensor mMeta.score.names[nameIdx]; the raw value is passed
+// through PostProcess::sigmoid() when the metadata score type is INFERENCE_SCORE_TYPE_SIGMOID.
+float ObjectDecoder::decodeYOLOScore(int idx, int nameIdx)
+{
+       const float rawScore = mTensorBuffer.getValue<float>(mMeta.score.names[nameIdx], idx);
+       const bool useSigmoid = (mMeta.GetScoreType() == INFERENCE_SCORE_TYPE_SIGMOID);
+
+       return useSigmoid ? PostProcess::sigmoid(rawScore) : rawScore;
+}
+// Decodes one box at index idx from box tensor mMeta.box.names[nameIdx]. The four raw values sit
+// at idx * mBoxOffset + offset + GetBoxOrder()[0..3]; they are passed through sigmoid when the
+// score type is SIGMOID, turned into center form when the box type is ORIGIN_LEFTTOP and divided
+// by mScaleW/mScaleH when the coordinate type is PIXEL. The label is read from the label tensor
+// when the metadata names one, otherwise the label argument is used.
+Box ObjectDecoder::decodeYOLOBox(int idx, float score, int label, int offset, int nameIdx)
+{
+       std::string &tensorName = mMeta.box.names[nameIdx];
+       const auto base = idx * mBoxOffset + offset;
+
+       // slots are (cx, cy, w, h), or (left, top, right, bottom) for ORIGIN_LEFTTOP
+       float cx = mTensorBuffer.getValue<float>(tensorName, base + mMeta.GetBoxOrder()[0]);
+       float cy = mTensorBuffer.getValue<float>(tensorName, base + mMeta.GetBoxOrder()[1]);
+       float cWidth = mTensorBuffer.getValue<float>(tensorName, base + mMeta.GetBoxOrder()[2]);
+       float cHeight = mTensorBuffer.getValue<float>(tensorName, base + mMeta.GetBoxOrder()[3]);
+
+       if (mMeta.GetScoreType() == INFERENCE_SCORE_TYPE_SIGMOID) {
+               cx = PostProcess::sigmoid(cx);
+               cy = PostProcess::sigmoid(cy);
+               cWidth = PostProcess::sigmoid(cWidth);
+               cHeight = PostProcess::sigmoid(cHeight);
+       }
+       LOGI("cx:%.2f, cy:%.2f, cW:%.2f, cH:%.2f", cx, cy, cWidth, cHeight);
+
+       // ORIGIN_LEFTTOP -> ORIGIN_CENTER
+       if (mMeta.GetBoxType() == INFERENCE_BOX_TYPE_ORIGIN_LEFTTOP) {
+               const float left = cx;
+               const float top = cy;
+               cx = (left + cWidth) * 0.5f; // (left + right)/2
+               cy = (top + cHeight) * 0.5f; // (top + bottom)/2
+               cWidth -= left; // right - left
+               cHeight -= top; // bottom - top
+       }
+
+       // PIXEL -> RATIO; NOTE(review): the type checked here comes from GetScoreCoordinate() - confirm
+       if (mMeta.GetScoreCoordinate() == INFERENCE_BOX_COORDINATE_TYPE_PIXEL) {
+               cx /= mScaleW;
+               cy /= mScaleH;
+               cWidth /= mScaleW;
+               cHeight /= mScaleH;
+       }
+
+       const bool labelFromTensor = !mMeta.GetBoxLabelName().empty();
+       const int boxLabel = labelFromTensor ? mTensorBuffer.getValue<int>(mMeta.GetBoxLabelName(), idx) : label;
+       Box box = { .index = boxLabel, .score = score, .location = cv::Rect2f(cx, cy, cWidth, cHeight) };
+       return box;
+}
+// Decodes the YOLO score/box tensors (one per stride) into boxesList[class index]. A grid cell
+// occupies mBoxOffset values: offsetAnchors records of (4 box values, box score, class scores).
+void ObjectDecoder::decodeYOLO(BoxesList &boxesList)
+{
+       box::DecodeInfo &decodeInfo = mMeta.GetBoxDecodeInfo();
+       box::AnchorParam &yoloAnchor = decodeInfo.anchorParam;
+       const int anchorsPerCell = yoloAnchor.offsetAnchors; // number of boxes per grid cell
+       const int recordSize = anchorsPerCell > 0 ? mBoxOffset / anchorsPerCell : 0;
+       if (recordSize < 5) {
+               LOGE("Invalid YOLO layout: offsetAnchors[%d], boxOffset[%d]", anchorsPerCell, mBoxOffset);
+               return;
+       }
+       // every record is 4 box values + 1 box score + mNumberOfOjects class scores
+       mNumberOfOjects = recordSize - 5;
+       boxesList.resize(mNumberOfOjects);
+
+       const int strideCount = static_cast<int>(yoloAnchor.strides.size());
+       const int anchorSets = static_cast<int>(decodeInfo.vAnchorBoxes.size());
+       for (int strideIdx = 0; strideIdx < strideCount && strideIdx < anchorSets; ++strideIdx) {
+               const int stride = yoloAnchor.strides[strideIdx];
+               const std::vector<cv::Rect2f> &anchors = decodeInfo.vAnchorBoxes[strideIdx];
+               int cellCount = stride > 0 ? (static_cast<int>(mScaleW) / stride) * (static_cast<int>(mScaleH) / stride) : 0;
+               if (cellCount > 0 && static_cast<size_t>(cellCount) * anchorsPerCell > anchors.size())
+                       cellCount = static_cast<int>(anchors.size()) / anchorsPerCell; // never walk past the generated anchors
+
+               for (int cellIdx = 0; cellIdx < cellCount; ++cellIdx) {
+                       for (int anchor = 0; anchor < anchorsPerCell; ++anchor) {
+                               // tensor order is (H,W,A): cells are mBoxOffset apart, records recordSize apart
+                               const int recordOffset = recordSize * anchor;
+                               const int recordBase = cellIdx * mBoxOffset + recordOffset;
+                               const cv::Rect2f &anchorBox = anchors[cellIdx * anchorsPerCell + anchor];
+                               const float boxScore = decodeYOLOScore(recordBase + 4, strideIdx);
+
+                               for (int objIdx = 0; objIdx < mNumberOfOjects; ++objIdx) {
+                                       const float objScore = decodeYOLOScore(recordBase + 5 + objIdx, strideIdx);
+                                       if (boxScore * objScore < mMeta.GetScoreThreshold())
+                                               continue;
+                                       LOGI("strideIdx%d, cellIdx%d, boxScore : %f, objScore : %f, objIdx%d", strideIdx, cellIdx,
+                                                boxScore, objScore, objIdx);
+
+                                       Box box = decodeYOLOBox(cellIdx, objScore, objIdx, recordOffset, strideIdx);
+                                       // xy = (2 * t + anchor position) * stride, wh = (2 * t)^2 * anchor size, as ratios
+                                       box.location.x = (box.location.x * 2 + anchorBox.x) * stride / mScaleW;
+                                       box.location.y = (box.location.y * 2 + anchorBox.y) * stride / mScaleH;
+                                       box.location.width =
+                                                       (box.location.width * 2) * (box.location.width * 2) * anchorBox.width / mScaleW;
+                                       box.location.height =
+                                                       (box.location.height * 2) * (box.location.height * 2) * anchorBox.height / mScaleH;
+                                       boxesList[objIdx].push_back(box);
+                               }
+                       }
+               }
+       }
+}
  int ObjectDecoder::decode()
  {
         LOGI("ENTER");
@@ -167,9 +279,11 @@ int ObjectDecoder::decode()
                         boxList.push_back(boxes);
                 }
         }
+       if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR) {
+               decodeYOLO(boxList);
+       }
  
-       if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR ||
-               mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR)
+       if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR)
                 boxList.push_back(boxes);
  
         if (!boxList.empty()) {
diff --git a/mv_machine_learning/inference/src/OutputMetadata.cpp b/mv_machine_learning/inference/src/OutputMetadata.cpp

index bdcbce4..edd776f 100644 (file)
--- a/mv_machine_learning/inference/src/OutputMetadata.cpp
+++ b/mv_machine_learning/inference/src/OutputMetadata.cpp
@@ -127,6 +127,12 @@ int OutputMetadata::Parse(JsonObject *root)
                                         LOGE("Fail to GenerateAnchor[%d]", ret);
                                         return ret;
                                 }
+                       } else {
+                               ret = box.GetDecodeInfo().GenerateYOLOAnchor();
+                               if (ret != MEDIA_VISION_ERROR_NONE) {
+                                       LOGE("Fail to GenerateAnchor[%d]", ret);
+                                       return ret;
+                               }
                         }
                 }
         }
diff --git a/test/testsuites/machine_learning/inference/test_object_detection.cpp b/test/testsuites/machine_learning/inference/test_object_detection.cpp

index b3fac94..5c7f599 100644 (file)
--- a/test/testsuites/machine_learning/inference/test_object_detection.cpp
+++ b/test/testsuites/machine_learning/inference/test_object_detection.cpp
@@ -44,6 +44,13 @@
         MV_CONFIG_PATH                                 \
         "/models/OD/snpe/yolov5s_quantize.dlc"
  
+#define OD_SNPE_LABEL_QC_YOLO_V5_MULTIANCHOR_PATH \
+       MV_CONFIG_PATH                                \
+       "/models/OD/snpe/label_coco_80.txt"
+
+#define IMG_BUS    \
+       MV_CONFIG_PATH \
+       "/res/inference/images/bus.jpg"
  void _object_detected_cb(mv_source_h source, const int number_of_objects, const int *indices, const char **names,
                                                  const float *confidences, const mv_rectangle_s *locations, void *user_data)
  {
@@ -130,6 +137,13 @@ public:
                 ASSERT_EQ(MediaVision::Common::ImageHelper::loadImageToSource(IMG_DOG, mv_source), MEDIA_VISION_ERROR_NONE);
                 ASSERT_EQ(mv_inference_object_detect(mv_source, infer, _object_detected_cb, &answer), MEDIA_VISION_ERROR_NONE);
         }
+       // Configures inference, loads IMG_BUS into mv_source and runs object detection on it.
+       // &answer is handed to mv_inference_object_detect() - presumably delivered to
+       // _object_detected_cb as user_data; confirm against the callback.
+       void inferenceBus(std::string &answer)
+       {
+               TestInference::ConfigureInference();
+               ASSERT_EQ(MediaVision::Common::ImageHelper::loadImageToSource(IMG_BUS, mv_source), MEDIA_VISION_ERROR_NONE);
+               ASSERT_EQ(mv_inference_object_detect(mv_source, infer, _object_detected_cb, &answer), MEDIA_VISION_ERROR_NONE);
+       }
  };
  
  TEST_P(TestObjectDetectionSnpe, DISABLED_EFDLite2QC)
@@ -146,10 +160,10 @@ TEST_P(TestObjectDetectionSnpe, DISABLED_YoloV5MultiAnchor)
  {
         ASSERT_TRUE(_use_json_parser);
         engine_config_hosted_snpe_model(engine_cfg, OD_SNPE_WEIGHT_QC_YOLO_V5_MULTIANCHOR_PATH,
-                                                                       OD_LABEL_EFFICIENTDET_LITE2_448_PATH, _use_json_parser, _target_device_type);
-
-       inferenceDog();
+                                                                       OD_SNPE_LABEL_QC_YOLO_V5_MULTIANCHOR_PATH, _use_json_parser, _target_device_type);
+       std::string answer = "bus";
+       inferenceBus(answer);
  }
  
  INSTANTIATE_TEST_CASE_P(Prefix, TestObjectDetectionSnpe,
-                                               ::testing::Values(ParamTypes(true, MV_INFERENCE_TARGET_DEVICE_CUSTOM)));
-\ No newline at end of file
+                                               ::testing::Values(ParamTypes(true, MV_INFERENCE_TARGET_DEVICE_CUSTOM)));
author	Kwanghoon Son <k.son@samsung.com>
	Thu, 22 Sep 2022 02:05:36 +0000 (22:05 -0400)
committer	Tae-Young Chung <ty83.chung@samsung.com>
	Wed, 9 Nov 2022 02:41:02 +0000 (02:41 +0000)
mv_machine_learning/inference/include/DecodeInfo.h		patch \| blob \| history
mv_machine_learning/inference/include/ObjectDecoder.h		patch \| blob \| history
mv_machine_learning/inference/src/DecodeInfo.cpp		patch \| blob \| history
mv_machine_learning/inference/src/Inference.cpp		patch \| blob \| history
mv_machine_learning/inference/src/ObjectDecoder.cpp		patch \| blob \| history
mv_machine_learning/inference/src/OutputMetadata.cpp		patch \| blob \| history
test/testsuites/machine_learning/inference/test_object_detection.cpp		patch \| blob \| history