From: Kwanghoon Son
Date: Fri, 18 Nov 2022 05:48:56 +0000 (-0500)
Subject: Add yolov5s model on SNPE
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=221809a2f73b6e44d63bc422a89a9204f872a24c;p=platform%2Fcore%2Fapi%2Fmediavision.git

Add yolov5s model on SNPE

[Issue type] new

Change-Id: If3209125970b1e8e2cce9cda9000f89c6617ce2b
Signed-off-by: Kwanghoon Son
---

diff --git a/mv_machine_learning/inference/include/DecodeInfo.h b/mv_machine_learning/inference/include/DecodeInfo.h
index 2f664238..f3b2afe1 100644
--- a/mv_machine_learning/inference/include/DecodeInfo.h
+++ b/mv_machine_learning/inference/include/DecodeInfo.h
@@ -57,8 +57,9 @@ struct AnchorParam {
 	int offsetAnchors;
 	inference_score_type_e type;
 	std::map supportedCellType;
-	std::vector<std::vector<float> > vxScales;
-	std::vector<std::vector<float> > vyScales;
+	std::vector<float> vxScales;
+	std::vector<float> vyScales;
+	unsigned int totalAnchors;
 };
 
 struct NMSParam {
@@ -90,6 +91,7 @@ struct RoiOptionParam {
 struct DecodeInfo {
 	AnchorParam anchorParam;
 	std::vector<cv::Rect2f> anchorBoxes;
+	std::vector<std::vector<cv::Rect2f> > vAnchorBoxes; // (stride) * (H * W * B) * (rect)
 	NMSParam nmsParam;
 	RotateParam rotParam;
 	RoiOptionParam roiOptParam;
@@ -127,6 +129,7 @@ struct DecodeInfo {
 	// Anchor param
 	int ParseAnchorParam(JsonObject *root);
 	int GenerateAnchor();
+	int GenerateYOLOAnchor();
 	bool IsFixedAnchorSize();
 	bool IsExponentialBoxScale();
 	float GetAnchorXscale();
diff --git a/mv_machine_learning/inference/include/DimInfo.h b/mv_machine_learning/inference/include/DimInfo.h
index ee77bf65..68c3df77 100644
--- a/mv_machine_learning/inference/include/DimInfo.h
+++ b/mv_machine_learning/inference/include/DimInfo.h
@@ -27,7 +27,7 @@ namespace inference
 struct DimInfo {
 	std::vector<int> dims;
 
-	std::vector<int> GetValidIndexAll() const
+	const std::vector<int> &GetValidIndexAll() const
 	{
 		LOGI("ENTER");
 
diff --git a/mv_machine_learning/inference/include/ObjectDecoder.h b/mv_machine_learning/inference/include/ObjectDecoder.h
index 50817d77..c9f37c3c 100644
--- a/mv_machine_learning/inference/include/ObjectDecoder.h
+++ b/mv_machine_learning/inference/include/ObjectDecoder.h
@@ -51,6 +51,9 @@ private:
 	float decodeScore(int idx);
 	Box decodeBox(int idx, float score, int label = -1, int offset = 0);
 	Box decodeBoxWithAnchor(int idx, int anchorIdx, float score, cv::Rect2f &anchor);
+	Box decodeYOLOBox(int idx, float score, int label, int offset, int nameIdx);
+	void decodeYOLO(BoxesList &boxList);
+	float decodeYOLOScore(int idx, int nameIdx);
 
 public:
 	ObjectDecoder(TensorBuffer &buffer, OutputMetadata &metaData, int boxOffset, float scaleW, float scaleH,
diff --git a/mv_machine_learning/inference/src/DecodeInfo.cpp b/mv_machine_learning/inference/src/DecodeInfo.cpp
index 5e4a831c..046fa98a 100644
--- a/mv_machine_learning/inference/src/DecodeInfo.cpp
+++ b/mv_machine_learning/inference/src/DecodeInfo.cpp
@@ -92,8 +92,8 @@ int DecodeInfo::ParseAnchorParam(JsonObject *root)
 			xScale_.push_back(xScale);
 			yScale_.push_back(yScale);
 		}
-		anchorParam.vxScales.push_back(xScale_);
-		anchorParam.vyScales.push_back(yScale_);
+		anchorParam.vxScales = xScale_;
+		anchorParam.vyScales = yScale_;
 	} else {
 		LOGE("Invalid anchor mode [%d]", anchorParam.mode);
@@ -350,3 +350,34 @@ int DecodeInfo::ParseRoiOption(JsonObject *root)
 
 	return MEDIA_VISION_ERROR_NONE;
 }
+
+/**
+ * @ref https://wikidocs.net/163607
+ */
+int DecodeInfo::GenerateYOLOAnchor()
+{
+	constexpr int maxAnchorPerCell = 3;
+	LOGI("ENTER");
+	auto anchorIndex = vAnchorBoxes.size();
+	std::vector<cv::Rect2f> cal;
+	auto stride = anchorParam.strides[anchorIndex];
+	auto gridHeight = anchorParam.inputSizeHeight / stride;
+	auto gridWidth = anchorParam.inputSizeWidth / stride;
+
+	for (int y = 0; y < gridHeight; ++y) {
+		for (int x = 0; x < gridWidth; ++x) {
+			for (int anchorPerCell = 0; anchorPerCell < maxAnchorPerCell; ++anchorPerCell) {
+				cv::Rect2f anchor = { cv::Point2f { (static_cast<float>(x) + anchorParam.anchorOffsetX),
+													(static_cast<float>(y) + anchorParam.anchorOffsetY) },
+									  cv::Size2f { anchorParam.vxScales[anchorPerCell] * static_cast<float>(stride),
+												   anchorParam.vyScales[anchorPerCell] * static_cast<float>(stride) } };
+				cal.push_back(anchor);
+			}
+		}
+	}
+	anchorParam.totalAnchors += cal.size();
+	vAnchorBoxes.push_back(cal);
+
+	LOGI("LEAVE");
+	return MEDIA_VISION_ERROR_NONE;
+}
\ No newline at end of file
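GenerateYOLOAnchor() above builds, for one stride at a time, an anchor rectangle for every grid cell and for each of the three anchor shapes configured for that stride, so each call appends gridHeight * gridWidth * 3 rectangles to vAnchorBoxes. A minimal standalone sketch of the same idea; the helper and parameter names below are illustrative only, not the mediavision API:

// Illustrative sketch of per-stride YOLO anchor generation (assumed names, not the mediavision API).
#include <opencv2/core.hpp>
#include <vector>

std::vector<cv::Rect2f> makeYoloAnchors(int inputW, int inputH, int stride,
					const std::vector<float> &wScales, // anchor widths in stride units
					const std::vector<float> &hScales, // anchor heights in stride units
					float cellOffset = 0.0f)           // per-cell offset (anchorOffsetX/Y above)
{
	std::vector<cv::Rect2f> anchors;
	const int gridW = inputW / stride;
	const int gridH = inputH / stride;

	for (int y = 0; y < gridH; ++y)
		for (int x = 0; x < gridW; ++x)
			for (size_t a = 0; a < wScales.size(); ++a)
				// One anchor per (cell, shape): the top-left carries the cell coordinate,
				// the size carries the anchor shape scaled back to input pixels.
				anchors.emplace_back(static_cast<float>(x) + cellOffset, static_cast<float>(y) + cellOffset,
						     wScales[a] * static_cast<float>(stride),
						     hScales[a] * static_cast<float>(stride));

	return anchors; // gridH * gridW * wScales.size() rectangles, in (H, W, A) order
}

For a 640 x 640 input with strides 8, 16 and 32, for example, this produces 80*80*3, 40*40*3 and 20*20*3 anchors respectively, one set per YOLOv5 detection head.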
diff --git a/mv_machine_learning/inference/src/Inference.cpp b/mv_machine_learning/inference/src/Inference.cpp
index a3917d5b..320c1160 100644
--- a/mv_machine_learning/inference/src/Inference.cpp
+++ b/mv_machine_learning/inference/src/Inference.cpp
@@ -1187,23 +1187,27 @@ int Inference::GetObjectDetectionResults(ObjectDetectionResults *results)
 				outputMeta.GetScoreName().c_str());
 			return MEDIA_VISION_ERROR_INVALID_OPERATION;
 		}
-
-	std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
-	if (boxIndexes.size() != 1) {
-		LOGE("Invalid dim size. It should be 1");
-		return MEDIA_VISION_ERROR_INVALID_OPERATION;
-	}
-
-	int boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
+	int boxOffset;
 	int numberOfObjects = 0;
-
-	if (outputMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
-		std::vector<int> scoreIndexes = outputMeta.GetScoreDimInfo().GetValidIndexAll();
-		if (scoreIndexes.size() != 1) {
+	if (outputMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR) {
+		boxOffset = 255;
+	} else {
+		std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
+		if (boxIndexes.size() != 1) {
 			LOGE("Invalid dim size. It should be 1");
 			return MEDIA_VISION_ERROR_INVALID_OPERATION;
 		}
-		numberOfObjects = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]];
+
+		boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
+
+		if (outputMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
+			std::vector<int> scoreIndexes = outputMeta.GetScoreDimInfo().GetValidIndexAll();
+			if (scoreIndexes.size() != 1) {
+				LOGE("Invalid dim size. It should be 1");
+				return MEDIA_VISION_ERROR_INVALID_OPERATION;
+			}
+			numberOfObjects = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]];
+		}
 	}
 
 	ObjectDecoder objDecoder(mOutputTensorBuffers, outputMeta, boxOffset,
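The hard-coded boxOffset of 255 in the YOLO branch above is the per-cell channel width of a YOLOv5 head trained on COCO: 3 anchors per cell, each carrying 4 box values, 1 objectness score and 80 class scores. A small compile-time check of that arithmetic (illustrative only; the constant names are not from the patch):

// Why boxOffset is 255 for a YOLOv5 COCO head (illustrative arithmetic only).
constexpr int anchorsPerCell = 3;
constexpr int numClasses = 80;                      // COCO label set
constexpr int valuesPerAnchor = 4 + 1 + numClasses; // cx, cy, w, h, objectness, class scores
constexpr int boxOffset = anchorsPerCell * valuesPerAnchor;
static_assert(boxOffset == 255, "YOLOv5 COCO head width");
// decodeYOLO() below recovers the class count the same way: boxOffset / anchorsPerCell - 5 == 80.
static_assert(boxOffset / anchorsPerCell - 5 == numClasses, "class count round-trips");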
It should be 1"); + return MEDIA_VISION_ERROR_INVALID_OPERATION; + } + numberOfObjects = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]]; + } } ObjectDecoder objDecoder(mOutputTensorBuffers, outputMeta, boxOffset, diff --git a/mv_machine_learning/inference/src/ObjectDecoder.cpp b/mv_machine_learning/inference/src/ObjectDecoder.cpp index fc62be47..4405a6fd 100644 --- a/mv_machine_learning/inference/src/ObjectDecoder.cpp +++ b/mv_machine_learning/inference/src/ObjectDecoder.cpp @@ -166,9 +166,9 @@ int ObjectDecoder::decode() boxList.push_back(boxes); } } - - if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR || - mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR) + if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR) + decodeYOLO(boxList); + else if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) boxList.push_back(boxes); if (!boxList.empty()) { @@ -192,5 +192,111 @@ Boxes &ObjectDecoder::getObjectAll() { return mResultBoxes; } + +float ObjectDecoder::decodeYOLOScore(int idx, int nameIdx) +{ + auto it = mMeta._tensor_info.begin(); + std::advance(it, nameIdx); + + float score = mTensorBuffer.getValue(it->first, idx); + if (mMeta.GetScoreType() == INFERENCE_SCORE_TYPE_SIGMOID) { + score = PostProcess::sigmoid(score); + } + + return score; +} +Box ObjectDecoder::decodeYOLOBox(int idx, float score, int label, int offset, int nameIdx) +{ + auto it = mMeta._tensor_info.begin(); + std::advance(it, nameIdx); + + // assume type is (cx,cy,w,h) + // left or cx + float cx = mTensorBuffer.getValue(it->first, idx * mBoxOffset + offset + mMeta.GetBoxOrder()[0]); + // top or cy + float cy = mTensorBuffer.getValue(it->first, idx * mBoxOffset + offset + mMeta.GetBoxOrder()[1]); + // right or width + float cWidth = mTensorBuffer.getValue(it->first, idx * mBoxOffset + offset + mMeta.GetBoxOrder()[2]); + // bottom or height + float cHeight = mTensorBuffer.getValue(it->first, idx * mBoxOffset + offset + mMeta.GetBoxOrder()[3]); + + if (mMeta.GetScoreType() == INFERENCE_SCORE_TYPE_SIGMOID) { + cx = PostProcess::sigmoid(cx); + cy = PostProcess::sigmoid(cy); + cWidth = PostProcess::sigmoid(cWidth); + cHeight = PostProcess::sigmoid(cHeight); + } + + LOGI("cx:%.2f, cy:%.2f, cW:%.2f, cH:%.2f", cx, cy, cWidth, cHeight); + // convert type to ORIGIN_CENTER if ORIGIN_LEFTTOP + if (mMeta.GetBoxType() == INFERENCE_BOX_TYPE_ORIGIN_LEFTTOP) { + float tmpCx = cx; + float tmpCy = cy; + cx = (cx + cWidth) * 0.5f; // (left + right)/2 + cy = (cy + cHeight) * 0.5f; // (top + bottom)/2 + cWidth = cWidth - tmpCx; // right - left + cHeight = cHeight - tmpCy; // bottom - top + } + + // convert coordinate to RATIO if PIXEL + if (mMeta.GetScoreCoordinate() == INFERENCE_BOX_COORDINATE_TYPE_PIXEL) { + cx /= mScaleW; + cy /= mScaleH; + cWidth /= mScaleW; + cHeight /= mScaleH; + } + + Box box = { .index = label, .score = score, .location = cv::Rect2f(cx, cy, cWidth, cHeight) }; + + return box; +} +void ObjectDecoder::decodeYOLO(BoxesList &boxesList) +{ + box::DecodeInfo &decodeInfo = mMeta.GetBoxDecodeInfo(); + box::AnchorParam &yoloAnchor = decodeInfo.anchorParam; + + //offsetAnchors is 3 which is number of BOX + mNumberOfOjects = mBoxOffset / yoloAnchor.offsetAnchors - 5; + boxesList.resize(mNumberOfOjects); + + for (auto strideIdx = 0; strideIdx < yoloAnchor.offsetAnchors; strideIdx++) { + auto &stride = yoloAnchor.strides[strideIdx]; + //for each stride + int startAnchorIdx = 0; + int endAnchorIdx = (static_cast(mScaleW) / 
diff --git a/mv_machine_learning/inference/src/OutputMetadata.cpp b/mv_machine_learning/inference/src/OutputMetadata.cpp
index 46db01e2..7fd2e22e 100644
--- a/mv_machine_learning/inference/src/OutputMetadata.cpp
+++ b/mv_machine_learning/inference/src/OutputMetadata.cpp
@@ -135,6 +135,12 @@ int OutputMetadata::GetPostProcess(JsonObject *root, LayerInfo &layer)
 				LOGE("Fail to GenerateAnchor[%d]", ret);
 				return ret;
 			}
+		} else if (box.GetDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR) {
+			ret = box.GetDecodeInfo().GenerateYOLOAnchor();
+			if (ret != MEDIA_VISION_ERROR_NONE) {
+				LOGE("Fail to GenerateYOLOAnchor[%d]", ret);
+				return ret;
+			}
 		}
 	}
 
diff --git a/test/testsuites/machine_learning/inference/test_object_detection.cpp b/test/testsuites/machine_learning/inference/test_object_detection.cpp
index b3fac94d..b6bc9902 100644
--- a/test/testsuites/machine_learning/inference/test_object_detection.cpp
+++ b/test/testsuites/machine_learning/inference/test_object_detection.cpp
@@ -43,6 +43,9 @@
 #define OD_SNPE_WEIGHT_QC_YOLO_V5_MULTIANCHOR_PATH \
 	MV_CONFIG_PATH \
 	"/models/OD/snpe/yolov5s_quantize.dlc"
+#define OD_SNPE_LABEL_QC_YOLO_V5_MULTIANCHOR_PATH \
+	MV_CONFIG_PATH \
+	"/models/OD/snpe/label_coco_80.txt"
 
 void _object_detected_cb(mv_source_h source, const int number_of_objects, const int *indices, const char **names,
 						 const float *confidences, const mv_rectangle_s *locations, void *user_data)
@@ -122,9 +125,8 @@ INSTANTIATE_TEST_CASE_P(Prefix, TestObjectDetectionTflite,
 class TestObjectDetectionSnpe : public TestInference
 {
 public:
-	void inferenceDog()
+	void inferenceDog(std::string &answer)
 	{
-		std::string answer("dog");
 		TestInference::ConfigureInference();
 
 		ASSERT_EQ(MediaVision::Common::ImageHelper::loadImageToSource(IMG_DOG, mv_source), MEDIA_VISION_ERROR_NONE);
@@ -138,7 +140,8 @@ TEST_P(TestObjectDetectionSnpe, DISABLED_EFDLite2QC)
 	engine_config_hosted_snpe_model(engine_cfg, OD_SNPE_WEIGHT_QC_EFFCIENTDET_LITE2_448_PATH,
 									OD_LABEL_EFFICIENTDET_LITE2_448_PATH, _use_json_parser, _target_device_type);
-	inferenceDog();
+	std::string answer("dog");
+	inferenceDog(answer);
 }
 
 TEST_P(TestObjectDetectionSnpe, DISABLED_YoloV5MultiAnchor)
@@ -146,9 +149,10 @@ TEST_P(TestObjectDetectionSnpe, DISABLED_YoloV5MultiAnchor)
 {
 	ASSERT_TRUE(_use_json_parser);
 	engine_config_hosted_snpe_model(engine_cfg, OD_SNPE_WEIGHT_QC_YOLO_V5_MULTIANCHOR_PATH,
-					OD_LABEL_EFFICIENTDET_LITE2_448_PATH, _use_json_parser, _target_device_type);
+					OD_SNPE_LABEL_QC_YOLO_V5_MULTIANCHOR_PATH, _use_json_parser, _target_device_type);
 
-	inferenceDog();
+	std::string answer("Dog");
+	inferenceDog(answer);
 }
 
 INSTANTIATE_TEST_CASE_P(Prefix, TestObjectDetectionSnpe,