Add yolo5s model on SNPE
author Kwanghoon Son <k.son@samsung.com>
Fri, 18 Nov 2022 05:48:56 +0000 (00:48 -0500)
committer Kwanghoon Son <k.son@samsung.com>
Wed, 14 Dec 2022 06:32:36 +0000 (15:32 +0900)
[Issue type] new

Change-Id: If3209125970b1e8e2cce9cda9000f89c6617ce2b
Signed-off-by: Kwanghoon Son <k.son@samsung.com>
mv_machine_learning/inference/include/DecodeInfo.h
mv_machine_learning/inference/include/DimInfo.h
mv_machine_learning/inference/include/ObjectDecoder.h
mv_machine_learning/inference/src/DecodeInfo.cpp
mv_machine_learning/inference/src/Inference.cpp
mv_machine_learning/inference/src/ObjectDecoder.cpp
mv_machine_learning/inference/src/OutputMetadata.cpp
test/testsuites/machine_learning/inference/test_object_detection.cpp

index 2f66423..f3b2afe 100644 (file)
@@ -57,8 +57,9 @@ struct AnchorParam {
        int offsetAnchors;
        inference_score_type_e type;
        std::map<std::string, inference_score_type_e> supportedCellType;
-       std::vector<std::vector<double> > vxScales;
-       std::vector<std::vector<double> > vyScales;
+       std::vector<double> vxScales;
+       std::vector<double> vyScales;
+       unsigned int totalAnchors;
 };
 
 struct NMSParam {
@@ -90,6 +91,7 @@ struct RoiOptionParam {
 struct DecodeInfo {
        AnchorParam anchorParam;
        std::vector<cv::Rect2f> anchorBoxes;
+       std::vector<std::vector<cv::Rect2f> > vAnchorBoxes; // (stride) * (H * W * B) * (rect)
        NMSParam nmsParam;
        RotateParam rotParam;
        RoiOptionParam roiOptParam;
@@ -127,6 +129,7 @@ struct DecodeInfo {
        // Anchor param
        int ParseAnchorParam(JsonObject *root);
        int GenerateAnchor();
+       int GenerateYOLOAnchor();
        bool IsFixedAnchorSize();
        bool IsExponentialBoxScale();
        float GetAnchorXscale();
index ee77bf6..68c3df7 100644 (file)
@@ -27,7 +27,7 @@ namespace inference
 struct DimInfo {
        std::vector<int> dims;
 
-       std::vector<int> GetValidIndexAll() const
+       const std::vector<int> &GetValidIndexAll() const
        {
                LOGI("ENTER");
 
index 50817d7..c9f37c3 100644 (file)
@@ -51,6 +51,9 @@ private:
        float decodeScore(int idx);
        Box decodeBox(int idx, float score, int label = -1, int offset = 0);
        Box decodeBoxWithAnchor(int idx, int anchorIdx, float score, cv::Rect2f &anchor);
+       Box decodeYOLOBox(int idx, float score, int label, int offset, int nameIdx);
+       void decodeYOLO(BoxesList &boxList);
+       float decodeYOLOScore(int idx, int nameIdx);
 
 public:
        ObjectDecoder(TensorBuffer &buffer, OutputMetadata &metaData, int boxOffset, float scaleW, float scaleH,
index 5e4a831..046fa98 100644 (file)
@@ -92,8 +92,8 @@ int DecodeInfo::ParseAnchorParam(JsonObject *root)
                        xScale_.push_back(xScale);
                        yScale_.push_back(yScale);
                }
-               anchorParam.vxScales.push_back(xScale_);
-               anchorParam.vyScales.push_back(yScale_);
+               anchorParam.vxScales = xScale_;
+               anchorParam.vyScales = yScale_;
 
        } else {
                LOGE("Invalid anchor mode [%d]", anchorParam.mode);
@@ -350,3 +350,34 @@ int DecodeInfo::ParseRoiOption(JsonObject *root)
 
        return MEDIA_VISION_ERROR_NONE;
 }
+
+/**
+ * @brief Generates the YOLO anchor boxes of the next output stride.
+ * @details Each call handles exactly one stride: the stride index is the number of anchor sets
+ *          already stored in vAnchorBoxes. For every grid cell (y outer, x inner) maxAnchorPerCell
+ *          anchors are appended, so one set is laid out as (H * W * B).
+ *          Anchor x/y hold the cell position plus anchorOffsetX/Y in grid units,
+ *          width/height hold vxScales/vyScales multiplied by the stride.
+ *          totalAnchors is increased by the number of anchors generated by this call.
+ * @return MEDIA_VISION_ERROR_NONE on success, MEDIA_VISION_ERROR_INVALID_OPERATION if the stride or
+ *         the anchor scales required by this call are missing or invalid.
+ * @ref https://wikidocs.net/163607
+ */
+int DecodeInfo::GenerateYOLOAnchor()
+{
+       constexpr int maxAnchorPerCell = 3;
+       LOGI("ENTER");
+
+       // The anchor set generated now belongs to the stride with the same index.
+       const auto anchorIndex = vAnchorBoxes.size();
+       if (anchorIndex >= anchorParam.strides.size()) {
+               LOGE("No stride is left for anchor set [%zu]", anchorIndex);
+               return MEDIA_VISION_ERROR_INVALID_OPERATION;
+       }
+
+       const auto stride = anchorParam.strides[anchorIndex];
+       if (stride <= 0) {
+               LOGE("Invalid stride for anchor set [%zu]", anchorIndex);
+               return MEDIA_VISION_ERROR_INVALID_OPERATION;
+       }
+
+       // Every cell reads maxAnchorPerCell scales, so both scale lists must provide at least that many.
+       if (anchorParam.vxScales.size() < static_cast<size_t>(maxAnchorPerCell) ||
+               anchorParam.vyScales.size() < static_cast<size_t>(maxAnchorPerCell)) {
+               LOGE("Anchor scales are fewer than [%d]", maxAnchorPerCell);
+               return MEDIA_VISION_ERROR_INVALID_OPERATION;
+       }
+
+       const auto gridHeight = anchorParam.inputSizeHeight / stride;
+       const auto gridWidth = anchorParam.inputSizeWidth / stride;
+
+       // Build the set in place to avoid copying it into vAnchorBoxes afterwards.
+       vAnchorBoxes.emplace_back();
+       auto &cal = vAnchorBoxes.back();
+       if (gridHeight > 0 && gridWidth > 0)
+               cal.reserve(static_cast<size_t>(gridHeight) * static_cast<size_t>(gridWidth) * maxAnchorPerCell);
+
+       for (int y = 0; y < gridHeight; ++y) {
+               const float cellY = static_cast<float>(y) + anchorParam.anchorOffsetY;
+
+               for (int x = 0; x < gridWidth; ++x) {
+                       const float cellX = static_cast<float>(x) + anchorParam.anchorOffsetX;
+
+                       for (int anchorPerCell = 0; anchorPerCell < maxAnchorPerCell; ++anchorPerCell) {
+                               // Scales are stored as double; convert explicitly instead of narrowing inside braces.
+                               const auto anchorWidth =
+                                               static_cast<float>(anchorParam.vxScales[anchorPerCell] * static_cast<float>(stride));
+                               const auto anchorHeight =
+                                               static_cast<float>(anchorParam.vyScales[anchorPerCell] * static_cast<float>(stride));
+
+                               cal.emplace_back(cv::Point2f { cellX, cellY }, cv::Size2f { anchorWidth, anchorHeight });
+                       }
+               }
+       }
+       anchorParam.totalAnchors += static_cast<unsigned int>(cal.size());
+
+       LOGI("LEAVE");
+       return MEDIA_VISION_ERROR_NONE;
+}
\ No newline at end of file
index a3917d5..320c116 100644 (file)
@@ -1187,23 +1187,27 @@ int Inference::GetObjectDetectionResults(ObjectDetectionResults *results)
                                 outputMeta.GetScoreName().c_str());
                        return MEDIA_VISION_ERROR_INVALID_OPERATION;
                }
-
-               std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
-               if (boxIndexes.size() != 1) {
-                       LOGE("Invalid dim size. It should be 1");
-                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
-               }
-
-               int boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
+               int boxOffset;
                int numberOfObjects = 0;
-
-               if (outputMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
-                       std::vector<int> scoreIndexes = outputMeta.GetScoreDimInfo().GetValidIndexAll();
-                       if (scoreIndexes.size() != 1) {
+               if (outputMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR) {
+                       boxOffset = 255;
+               } else {
+                       std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
+                       if (boxIndexes.size() != 1) {
                                LOGE("Invalid dim size. It should be 1");
                                return MEDIA_VISION_ERROR_INVALID_OPERATION;
                        }
-                       numberOfObjects = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]];
+
+                       boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
+
+                       if (outputMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
+                               std::vector<int> scoreIndexes = outputMeta.GetScoreDimInfo().GetValidIndexAll();
+                               if (scoreIndexes.size() != 1) {
+                                       LOGE("Invalid dim size. It should be 1");
+                                       return MEDIA_VISION_ERROR_INVALID_OPERATION;
+                               }
+                               numberOfObjects = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]];
+                       }
                }
 
                ObjectDecoder objDecoder(mOutputTensorBuffers, outputMeta, boxOffset,
index fc62be4..4405a6f 100644 (file)
@@ -166,9 +166,9 @@ int ObjectDecoder::decode()
                        boxList.push_back(boxes);
                }
        }
-
-       if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR ||
-               mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR)
+       if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR)
+               decodeYOLO(boxList);
+       else if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR)
                boxList.push_back(boxes);
 
        if (!boxList.empty()) {
@@ -192,5 +192,111 @@ Boxes &ObjectDecoder::getObjectAll()
 {
        return mResultBoxes;
 }
+
+/**
+ * @brief Reads a single score value from one YOLO output tensor.
+ * @param idx      Element index inside the selected tensor.
+ * @param nameIdx  Position of the tensor within mMeta._tensor_info, counted from its first entry.
+ * @return The value at idx, passed through sigmoid when the score type is INFERENCE_SCORE_TYPE_SIGMOID,
+ *         otherwise the value as stored.
+ */
+float ObjectDecoder::decodeYOLOScore(int idx, int nameIdx)
+{
+       // Step to the nameIdx-th tensor entry; its key is the tensor name used for the lookup.
+       const auto tensorIt = std::next(mMeta._tensor_info.begin(), nameIdx);
+       const float rawScore = mTensorBuffer.getValue<float>(tensorIt->first, idx);
+
+       if (mMeta.GetScoreType() != INFERENCE_SCORE_TYPE_SIGMOID)
+               return rawScore;
+
+       const float activated = PostProcess::sigmoid(rawScore);
+       return activated;
+}
+/**
+ * @brief Reads one box (four components) from a YOLO output tensor and wraps it in a Box.
+ * @details The four components are read at idx * mBoxOffset + offset + GetBoxOrder()[0..3].
+ *          They are passed through sigmoid when the score type is INFERENCE_SCORE_TYPE_SIGMOID,
+ *          converted from (left, top, right, bottom) to (cx, cy, w, h) when the box type is
+ *          INFERENCE_BOX_TYPE_ORIGIN_LEFTTOP, and divided by mScaleW / mScaleH when the coordinate
+ *          type is INFERENCE_BOX_COORDINATE_TYPE_PIXEL.
+ * @param idx      Index multiplied by mBoxOffset to find the start of the record.
+ * @param score    Score stored in the returned Box.
+ * @param label    Index stored in the returned Box.
+ * @param offset   Extra element offset added to the record start.
+ * @param nameIdx  Position of the tensor within mMeta._tensor_info, counted from its first entry.
+ * @return Box holding label, score and the decoded rectangle.
+ */
+Box ObjectDecoder::decodeYOLOBox(int idx, float score, int label, int offset, int nameIdx)
+{
+       const auto tensorIt = std::next(mMeta._tensor_info.begin(), nameIdx);
+       const auto recordStart = idx * mBoxOffset + offset;
+
+       // Fetches the component whose position inside the record is given by the box order table.
+       auto readComponent = [&](int orderIdx) {
+               return mTensorBuffer.getValue<float>(tensorIt->first, recordStart + mMeta.GetBoxOrder()[orderIdx]);
+       };
+
+       // The components are assumed to be (cx, cy, w, h); for LEFTTOP boxes they are (left, top, right, bottom).
+       float cx = readComponent(0);
+       float cy = readComponent(1);
+       float cWidth = readComponent(2);
+       float cHeight = readComponent(3);
+
+       const bool useSigmoid = (mMeta.GetScoreType() == INFERENCE_SCORE_TYPE_SIGMOID);
+       if (useSigmoid) {
+               cx = PostProcess::sigmoid(cx);
+               cy = PostProcess::sigmoid(cy);
+               cWidth = PostProcess::sigmoid(cWidth);
+               cHeight = PostProcess::sigmoid(cHeight);
+       }
+
+       LOGI("cx:%.2f, cy:%.2f, cW:%.2f, cH:%.2f", cx, cy, cWidth, cHeight);
+
+       // Corner based boxes are turned into center based ones.
+       if (mMeta.GetBoxType() == INFERENCE_BOX_TYPE_ORIGIN_LEFTTOP) {
+               const float left = cx;
+               const float top = cy;
+               const float right = cWidth;
+               const float bottom = cHeight;
+
+               cx = (left + right) * 0.5f;
+               cy = (top + bottom) * 0.5f;
+               cWidth = right - left;
+               cHeight = bottom - top;
+       }
+
+       // Pixel coordinates are turned into ratios.
+       if (mMeta.GetScoreCoordinate() == INFERENCE_BOX_COORDINATE_TYPE_PIXEL) {
+               cx /= mScaleW;
+               cy /= mScaleH;
+               cWidth /= mScaleW;
+               cHeight /= mScaleH;
+       }
+
+       Box decoded = { .index = label, .score = score, .location = cv::Rect2f(cx, cy, cWidth, cHeight) };
+
+       return decoded;
+}
+/**
+ * @brief Decodes all YOLO output tensors into candidate boxes grouped by class.
+ * @details mBoxOffset is the length of one grid cell record: offsetAnchors boxes, each made of
+ *          (x, y, w, h, objectness) followed by one score per class. The number of classes is derived
+ *          from that layout and stored in mNumberOfOjects. One output tensor is decoded per generated
+ *          anchor set (one set per stride); cells are visited in the same (H, W, A) order the anchors
+ *          were generated in. A box is kept for a class when objectness * class score reaches the
+ *          score threshold, and its location is rescaled with the matching anchor.
+ *          Nothing is decoded when the layout is invalid or no anchor set exists; a stride whose
+ *          anchor set is too small for the grid is skipped.
+ * @param boxesList  Resized to the number of classes; every kept box is appended to the list of its class.
+ */
+void ObjectDecoder::decodeYOLO(BoxesList &boxesList)
+{
+       box::DecodeInfo &decodeInfo = mMeta.GetBoxDecodeInfo();
+       box::AnchorParam &yoloAnchor = decodeInfo.anchorParam;
+
+       // (x, y, w, h, objectness) come first in every box record, objectness being the last of them.
+       constexpr int boxHeaderSize = 5;
+       constexpr int objectnessOffset = 4;
+
+       // offsetAnchors is the number of boxes predicted per grid cell.
+       const int boxesPerCell = yoloAnchor.offsetAnchors;
+       if (boxesPerCell <= 0) {
+               LOGE("Invalid number of boxes per cell [%d]", boxesPerCell);
+               return;
+       }
+
+       const int boxRecordSize = mBoxOffset / boxesPerCell;
+       if (boxRecordSize <= boxHeaderSize) {
+               LOGE("Invalid box offset [%d] for [%d] boxes per cell", mBoxOffset, boxesPerCell);
+               return;
+       }
+
+       if (decodeInfo.vAnchorBoxes.empty()) {
+               LOGE("No YOLO anchor set is available");
+               return;
+       }
+
+       mNumberOfOjects = boxRecordSize - boxHeaderSize;
+       boxesList.resize(mNumberOfOjects);
+
+       // One anchor set is generated per stride, so the anchor sets define how many tensors to decode.
+       const int numStrides = static_cast<int>(decodeInfo.vAnchorBoxes.size());
+
+       for (int strideIdx = 0; strideIdx < numStrides; ++strideIdx) {
+               const auto stride = yoloAnchor.strides[strideIdx];
+               if (stride <= 0) {
+                       LOGE("Invalid stride at index [%d]", strideIdx);
+                       continue;
+               }
+
+               // Number of grid cells, computed per axis so it matches the grid used to generate the anchors.
+               const int numCells = (static_cast<int>(mScaleW) / stride) * (static_cast<int>(mScaleH) / stride);
+               if (numCells <= 0)
+                       continue;
+
+               const auto &anchors = decodeInfo.vAnchorBoxes[strideIdx];
+               if (anchors.size() < static_cast<size_t>(numCells) * static_cast<size_t>(boxesPerCell)) {
+                       LOGE("Anchor set [%d] is smaller than the output grid", strideIdx);
+                       continue;
+               }
+
+               for (int cellIdx = 0; cellIdx < numCells; ++cellIdx) {
+                       // for each grid cell
+                       const int cellStart = cellIdx * mBoxOffset;
+
+                       for (int boxIdx = 0; boxIdx < boxesPerCell; ++boxIdx) {
+                               // for each BOX, handle order is (H,W,A)
+                               const int boxStart = boxRecordSize * boxIdx;
+                               const float boxScore = decodeYOLOScore(cellStart + boxStart + objectnessOffset, strideIdx);
+                               const cv::Rect2f &anchorBox = anchors[cellIdx * boxesPerCell + boxIdx];
+
+                               for (int objIdx = 0; objIdx < mNumberOfOjects; ++objIdx) { //each box to every object
+                                       const float objScore =
+                                                       decodeYOLOScore(cellStart + boxStart + boxHeaderSize + objIdx, strideIdx);
+
+                                       if (boxScore * objScore < mMeta.GetScoreThreshold())
+                                               continue;
+
+                                       Box box = decodeYOLOBox(cellIdx, objScore, objIdx, boxStart, strideIdx);
+
+                                       // Center: doubled prediction shifted by the anchor cell, scaled back by the stride.
+                                       box.location.x = (box.location.x * 2 + anchorBox.x) * stride / mScaleW;
+                                       box.location.y = (box.location.y * 2 + anchorBox.y) * stride / mScaleH;
+                                       // Size: squared doubled prediction multiplied by the anchor size.
+                                       box.location.width =
+                                                       (box.location.width * 2) * (box.location.width * 2) * anchorBox.width / mScaleW;
+                                       box.location.height =
+                                                       (box.location.height * 2) * (box.location.height * 2) * anchorBox.height / mScaleH;
+
+                                       boxesList[objIdx].push_back(box);
+                               }
+                       }
+               }
+       }
+}
 }
 }
index 46db01e..7fd2e22 100644 (file)
@@ -135,6 +135,12 @@ int OutputMetadata::GetPostProcess(JsonObject *root, LayerInfo &layer)
                                LOGE("Fail to GenerateAnchor[%d]", ret);
                                return ret;
                        }
+               } else if (box.GetDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR) {
+                       ret = box.GetDecodeInfo().GenerateYOLOAnchor();
+                       if (ret != MEDIA_VISION_ERROR_NONE) {
+                               LOGE("Fail to GenerateAnchor[%d]", ret);
+                               return ret;
+                       }
                }
        }
 
index b3fac94..b6bc990 100644 (file)
@@ -43,6 +43,9 @@
 #define OD_SNPE_WEIGHT_QC_YOLO_V5_MULTIANCHOR_PATH \
        MV_CONFIG_PATH                                 \
        "/models/OD/snpe/yolov5s_quantize.dlc"
+#define OD_SNPE_LABEL_QC_YOLO_V5_MULTIANCHOR_PATH \
+       MV_CONFIG_PATH                                \
+       "/models/OD/snpe/label_coco_80.txt"
 
 void _object_detected_cb(mv_source_h source, const int number_of_objects, const int *indices, const char **names,
                                                 const float *confidences, const mv_rectangle_s *locations, void *user_data)
@@ -122,9 +125,8 @@ INSTANTIATE_TEST_CASE_P(Prefix, TestObjectDetectionTflite,
 class TestObjectDetectionSnpe : public TestInference
 {
 public:
-       void inferenceDog()
+       void inferenceDog(std::string &answer)
        {
-               std::string answer("dog");
                TestInference::ConfigureInference();
 
                ASSERT_EQ(MediaVision::Common::ImageHelper::loadImageToSource(IMG_DOG, mv_source), MEDIA_VISION_ERROR_NONE);
@@ -138,7 +140,8 @@ TEST_P(TestObjectDetectionSnpe, DISABLED_EFDLite2QC)
        engine_config_hosted_snpe_model(engine_cfg, OD_SNPE_WEIGHT_QC_EFFCIENTDET_LITE2_448_PATH,
                                                                        OD_LABEL_EFFICIENTDET_LITE2_448_PATH, _use_json_parser, _target_device_type);
 
-       inferenceDog();
+       std::string answer("dog");
+       inferenceDog(answer);
 }
 
 TEST_P(TestObjectDetectionSnpe, DISABLED_YoloV5MultiAnchor)
@@ -146,9 +149,10 @@ TEST_P(TestObjectDetectionSnpe, DISABLED_YoloV5MultiAnchor)
 {
        ASSERT_TRUE(_use_json_parser);
        engine_config_hosted_snpe_model(engine_cfg, OD_SNPE_WEIGHT_QC_YOLO_V5_MULTIANCHOR_PATH,
-                                                                       OD_LABEL_EFFICIENTDET_LITE2_448_PATH, _use_json_parser, _target_device_type);
+                                                                       OD_SNPE_LABEL_QC_YOLO_V5_MULTIANCHOR_PATH, _use_json_parser, _target_device_type);
 
-       inferenceDog();
+       std::string answer("Dog");
+       inferenceDog(answer);
 }
 
 INSTANTIATE_TEST_CASE_P(Prefix, TestObjectDetectionSnpe,