int offsetAnchors;
inference_score_type_e type;
std::map<std::string, inference_score_type_e> supportedCellType;
- std::vector<std::vector<double> > vxScales;
- std::vector<std::vector<double> > vyScales;
+ std::vector<double> vxScales;
+ std::vector<double> vyScales;
+ unsigned int totalAnchors;
};
struct NMSParam {
struct DecodeInfo {
AnchorParam anchorParam;
std::vector<cv::Rect2f> anchorBoxes;
+ std::vector<std::vector<cv::Rect2f> > vAnchorBoxes; // (stride) * (H * W * B) * (rect)
NMSParam nmsParam;
RotateParam rotParam;
RoiOptionParam roiOptParam;
// Anchor param
int ParseAnchorParam(JsonObject *root);
int GenerateAnchor();
+ int GenerateYOLOAnchor();
bool IsFixedAnchorSize();
bool IsExponentialBoxScale();
float GetAnchorXscale();
struct DimInfo {
std::vector<int> dims;
- std::vector<int> GetValidIndexAll() const
+ const std::vector<int> &GetValidIndexAll() const
{
LOGI("ENTER");
float decodeScore(int idx);
Box decodeBox(int idx, float score, int label = -1, int offset = 0);
Box decodeBoxWithAnchor(int idx, int anchorIdx, float score, cv::Rect2f &anchor);
+ Box decodeYOLOBox(int idx, float score, int label, int offset, int nameIdx);
+ void decodeYOLO(BoxesList &boxList);
+ float decodeYOLOScore(int idx, int nameIdx);
public:
ObjectDecoder(TensorBuffer &buffer, OutputMetadata &metaData, int boxOffset, float scaleW, float scaleH,
xScale_.push_back(xScale);
yScale_.push_back(yScale);
}
- anchorParam.vxScales.push_back(xScale_);
- anchorParam.vyScales.push_back(yScale_);
+ anchorParam.vxScales = xScale_;
+ anchorParam.vyScales = yScale_;
} else {
LOGE("Invalid anchor mode [%d]", anchorParam.mode);
return MEDIA_VISION_ERROR_NONE;
}
+
+/**
+ * @brief Generates the anchor boxes of one YOLO output scale and appends them to vAnchorBoxes.
+ * @details Each call handles the scale whose index equals the current size of vAnchorBoxes,
+ *          so repeated calls walk through anchorParam.strides in order. For every grid cell,
+ *          maxAnchorPerCell anchors are emitted in (y, x, anchor) order. The anchor position is
+ *          the cell index plus the configured anchor offset (no stride applied), and the anchor
+ *          size is the configured x/y scale multiplied by the stride.
+ *          anchorParam.totalAnchors is increased by the number of anchors generated.
+ * @ref https://wikidocs.net/163607
+ * @return MEDIA_VISION_ERROR_NONE on success. MEDIA_VISION_ERROR_INVALID_OPERATION if no stride
+ *         is left for this call, the stride is not positive, or fewer than maxAnchorPerCell
+ *         scales are configured.
+ */
+int DecodeInfo::GenerateYOLOAnchor()
+{
+	constexpr int maxAnchorPerCell = 3;
+	LOGI("ENTER");
+
+	// The number of scales generated so far selects the stride handled by this call.
+	const auto anchorIndex = vAnchorBoxes.size();
+	if (anchorIndex >= anchorParam.strides.size()) {
+		LOGE("No stride is available for anchor index [%zu]", anchorIndex);
+		return MEDIA_VISION_ERROR_INVALID_OPERATION;
+	}
+
+	const auto stride = anchorParam.strides[anchorIndex];
+	if (stride <= 0) {
+		// A zero stride would divide by zero below.
+		LOGE("Invalid stride [%d]", static_cast<int>(stride));
+		return MEDIA_VISION_ERROR_INVALID_OPERATION;
+	}
+
+	if (anchorParam.vxScales.size() < static_cast<size_t>(maxAnchorPerCell) ||
+		anchorParam.vyScales.size() < static_cast<size_t>(maxAnchorPerCell)) {
+		LOGE("At least [%d] anchor scales are required", maxAnchorPerCell);
+		return MEDIA_VISION_ERROR_INVALID_OPERATION;
+	}
+
+	// The anchor size depends only on the anchor slot, so compute it once instead of per cell.
+	// The scales are stored as double; the conversion to float is made explicit.
+	cv::Size2f anchorSizes[maxAnchorPerCell];
+	for (int anchorPerCell = 0; anchorPerCell < maxAnchorPerCell; ++anchorPerCell) {
+		anchorSizes[anchorPerCell] =
+				cv::Size2f(static_cast<float>(anchorParam.vxScales[anchorPerCell] * static_cast<float>(stride)),
+						   static_cast<float>(anchorParam.vyScales[anchorPerCell] * static_cast<float>(stride)));
+	}
+
+	const auto gridHeight = anchorParam.inputSizeHeight / stride;
+	const auto gridWidth = anchorParam.inputSizeWidth / stride;
+
+	std::vector<cv::Rect2f> cal;
+	if (gridHeight > 0 && gridWidth > 0)
+		cal.reserve(static_cast<size_t>(gridHeight) * static_cast<size_t>(gridWidth) * maxAnchorPerCell);
+
+	for (int y = 0; y < gridHeight; ++y) {
+		for (int x = 0; x < gridWidth; ++x) {
+			const cv::Point2f origin { (static_cast<float>(x) + anchorParam.anchorOffsetX),
+									   (static_cast<float>(y) + anchorParam.anchorOffsetY) };
+
+			for (int anchorPerCell = 0; anchorPerCell < maxAnchorPerCell; ++anchorPerCell)
+				cal.emplace_back(origin, anchorSizes[anchorPerCell]);
+		}
+	}
+
+	anchorParam.totalAnchors += static_cast<unsigned int>(cal.size());
+	vAnchorBoxes.push_back(cal);
+
+	LOGI("LEAVE");
+	return MEDIA_VISION_ERROR_NONE;
+}
\ No newline at end of file
outputMeta.GetScoreName().c_str());
return MEDIA_VISION_ERROR_INVALID_OPERATION;
}
-
- std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
- if (boxIndexes.size() != 1) {
- LOGE("Invalid dim size. It should be 1");
- return MEDIA_VISION_ERROR_INVALID_OPERATION;
- }
-
- int boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
+ int boxOffset;
int numberOfObjects = 0;
-
- if (outputMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
- std::vector<int> scoreIndexes = outputMeta.GetScoreDimInfo().GetValidIndexAll();
- if (scoreIndexes.size() != 1) {
+ if (outputMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR) {
+ boxOffset = 255;
+ } else {
+ std::vector<int> boxIndexes = outputMeta.GetBoxDimInfo().GetValidIndexAll();
+ if (boxIndexes.size() != 1) {
LOGE("Invalid dim size. It should be 1");
return MEDIA_VISION_ERROR_INVALID_OPERATION;
}
- numberOfObjects = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]];
+
+ boxOffset = mOutputLayerProperty.layers[outputMeta.GetBoxName()].shape[boxIndexes[0]];
+
+ if (outputMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
+ std::vector<int> scoreIndexes = outputMeta.GetScoreDimInfo().GetValidIndexAll();
+ if (scoreIndexes.size() != 1) {
+ LOGE("Invalid dim size. It should be 1");
+ return MEDIA_VISION_ERROR_INVALID_OPERATION;
+ }
+ numberOfObjects = mOutputLayerProperty.layers[outputMeta.GetScoreName()].shape[scoreIndexes[0]];
+ }
}
ObjectDecoder objDecoder(mOutputTensorBuffers, outputMeta, boxOffset,
boxList.push_back(boxes);
}
}
-
- if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR ||
- mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR)
+ if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR)
+ decodeYOLO(boxList);
+ else if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR)
boxList.push_back(boxes);
if (!boxList.empty()) {
{
return mResultBoxes;
}
+
+/**
+ * @brief Reads a single score value from one of the output tensors.
+ * @param idx     flat element index inside the selected tensor.
+ * @param nameIdx position of the tensor within mMeta._tensor_info (iteration order).
+ * @return the raw tensor value, passed through sigmoid when the configured score type is
+ *         INFERENCE_SCORE_TYPE_SIGMOID.
+ */
+float ObjectDecoder::decodeYOLOScore(int idx, int nameIdx)
+{
+	// Select the tensor by walking nameIdx entries from the first one.
+	const auto tensorIt = std::next(mMeta._tensor_info.begin(), nameIdx);
+	const float rawScore = mTensorBuffer.getValue<float>(tensorIt->first, idx);
+
+	return (mMeta.GetScoreType() == INFERENCE_SCORE_TYPE_SIGMOID) ? PostProcess::sigmoid(rawScore) : rawScore;
+}
+/**
+ * @brief Reads the four box values of one prediction and normalizes their representation.
+ * @details The values are fetched from the tensor at position nameIdx of mMeta._tensor_info,
+ *          starting at idx * mBoxOffset + offset and using the configured box order. They are
+ *          passed through sigmoid when the score type is INFERENCE_SCORE_TYPE_SIGMOID, converted
+ *          from (left, top, right, bottom) to (cx, cy, width, height) when the box type is
+ *          INFERENCE_BOX_TYPE_ORIGIN_LEFTTOP, and divided by mScaleW / mScaleH when the
+ *          coordinate type is INFERENCE_BOX_COORDINATE_TYPE_PIXEL.
+ * @param idx     index of the prediction row; multiplied by mBoxOffset.
+ * @param score   score stored in the returned box.
+ * @param label   class index stored in the returned box.
+ * @param offset  additional element offset inside the row.
+ * @param nameIdx position of the tensor within mMeta._tensor_info (iteration order).
+ * @return a Box whose location holds (cx, cy, width, height).
+ */
+Box ObjectDecoder::decodeYOLOBox(int idx, float score, int label, int offset, int nameIdx)
+{
+	const auto tensorIt = std::next(mMeta._tensor_info.begin(), nameIdx);
+	const auto &tensorName = tensorIt->first;
+
+	// First element of this prediction; the box order picks the four fields relative to it.
+	const int base = idx * mBoxOffset + offset;
+	const auto &order = mMeta.GetBoxOrder();
+
+	// assume type is (cx,cy,w,h); for LEFTTOP boxes these are (left, top, right, bottom)
+	float centerX = mTensorBuffer.getValue<float>(tensorName, base + order[0]);
+	float centerY = mTensorBuffer.getValue<float>(tensorName, base + order[1]);
+	float boxW = mTensorBuffer.getValue<float>(tensorName, base + order[2]);
+	float boxH = mTensorBuffer.getValue<float>(tensorName, base + order[3]);
+
+	const bool useSigmoid = (mMeta.GetScoreType() == INFERENCE_SCORE_TYPE_SIGMOID);
+	if (useSigmoid) {
+		centerX = PostProcess::sigmoid(centerX);
+		centerY = PostProcess::sigmoid(centerY);
+		boxW = PostProcess::sigmoid(boxW);
+		boxH = PostProcess::sigmoid(boxH);
+	}
+
+	LOGI("cx:%.2f, cy:%.2f, cW:%.2f, cH:%.2f", centerX, centerY, boxW, boxH);
+
+	// convert type to ORIGIN_CENTER if ORIGIN_LEFTTOP
+	if (mMeta.GetBoxType() == INFERENCE_BOX_TYPE_ORIGIN_LEFTTOP) {
+		const float left = centerX;
+		const float top = centerY;
+		const float right = boxW;
+		const float bottom = boxH;
+
+		centerX = (left + right) * 0.5f;
+		centerY = (top + bottom) * 0.5f;
+		boxW = right - left;
+		boxH = bottom - top;
+	}
+
+	// convert coordinate to RATIO if PIXEL
+	if (mMeta.GetScoreCoordinate() == INFERENCE_BOX_COORDINATE_TYPE_PIXEL) {
+		centerX /= mScaleW;
+		centerY /= mScaleH;
+		boxW /= mScaleW;
+		boxH /= mScaleH;
+	}
+
+	Box decoded = { .index = label, .score = score, .location = cv::Rect2f(centerX, centerY, boxW, boxH) };
+
+	return decoded;
+}
+/**
+ * @brief Decodes the YOLO output tensors into one list of boxes per class.
+ * @details For every scale, grid cell and box slot, the box score (element 4 of the box) is
+ *          combined with each class score (elements 5 and up). A box is kept when the product
+ *          is not below the score threshold. Kept boxes are combined with the anchor of the
+ *          same scale, cell and slot, then appended to boxesList[class index].
+ *          Nothing is decoded (and boxesList is left untouched) when offsetAnchors is not
+ *          positive or the derived class count is not positive.
+ * @param[out] boxesList resized to the number of classes and filled with the decoded boxes.
+ */
+void ObjectDecoder::decodeYOLO(BoxesList &boxesList)
+{
+	box::DecodeInfo &decodeInfo = mMeta.GetBoxDecodeInfo();
+	box::AnchorParam &yoloAnchor = decodeInfo.anchorParam;
+
+	// offsetAnchors is the number of boxes predicted per grid cell.
+	const int boxesPerCell = yoloAnchor.offsetAnchors;
+	if (boxesPerCell <= 0) {
+		LOGE("Invalid number of boxes per cell [%d]", boxesPerCell);
+		return;
+	}
+
+	// Each box consists of 4 coordinates, 1 box score and one score per class.
+	mNumberOfOjects = mBoxOffset / boxesPerCell - 5;
+	if (mNumberOfOjects <= 0) {
+		LOGE("Invalid number of objects [%d]", static_cast<int>(mNumberOfOjects));
+		return;
+	}
+	boxesList.resize(mNumberOfOjects);
+
+	// Decode only the scales that have a stride, a generated anchor set and an output tensor.
+	size_t numScales = yoloAnchor.strides.size();
+	if (decodeInfo.vAnchorBoxes.size() < numScales)
+		numScales = decodeInfo.vAnchorBoxes.size();
+	if (mMeta._tensor_info.size() < numScales)
+		numScales = mMeta._tensor_info.size();
+
+	const int boxStride = mNumberOfOjects + 5;
+
+	for (size_t scaleIdx = 0; scaleIdx < numScales; ++scaleIdx) {
+		const int strideIdx = static_cast<int>(scaleIdx);
+		const auto stride = yoloAnchor.strides[scaleIdx];
+		if (stride <= 0) {
+			LOGE("Invalid stride [%d]", static_cast<int>(stride));
+			continue;
+		}
+
+		const std::vector<cv::Rect2f> &anchors = decodeInfo.vAnchorBoxes[scaleIdx];
+
+		// Number of grid cells: divide each axis by the stride before multiplying, so the count
+		// agrees with a per-axis grid even when the sizes are not multiples of the stride.
+		int numCells = (static_cast<int>(mScaleW) / stride) * (static_cast<int>(mScaleH) / stride);
+
+		// Never read past the anchors that exist for this scale.
+		const int anchoredCells = static_cast<int>(anchors.size() / static_cast<size_t>(boxesPerCell));
+		if (numCells > anchoredCells)
+			numCells = anchoredCells;
+
+		for (int cellIdx = 0; cellIdx < numCells; ++cellIdx) {
+			// for each grid cell
+			for (int offset = 0; offset < boxesPerCell; ++offset) {
+				// for each BOX; handle order is (H,W,A)
+				const int boxBase = cellIdx * mBoxOffset + boxStride * offset;
+				const float boxScore = decodeYOLOScore(boxBase + 4, strideIdx);
+				const cv::Rect2f &anchorBox = anchors[cellIdx * boxesPerCell + offset];
+
+				for (int objIdx = 0; objIdx < mNumberOfOjects; ++objIdx) { // each box to every object
+					const float objScore = decodeYOLOScore(boxBase + 5 + objIdx, strideIdx);
+
+					if (boxScore * objScore < mMeta.GetScoreThreshold())
+						continue;
+
+					Box box = decodeYOLOBox(cellIdx, objScore, objIdx, boxStride * offset, strideIdx);
+
+					// Combine the decoded values with the anchor and scale by mScaleW / mScaleH.
+					box.location.x = (box.location.x * 2 + anchorBox.x) * stride / mScaleW;
+					box.location.y = (box.location.y * 2 + anchorBox.y) * stride / mScaleH;
+					box.location.width =
+							(box.location.width * 2) * (box.location.width * 2) * anchorBox.width / mScaleW;
+					box.location.height =
+							(box.location.height * 2) * (box.location.height * 2) * anchorBox.height / mScaleH;
+
+					boxesList[objIdx].push_back(box);
+				}
+			}
+		}
+	}
+}
}
}
LOGE("Fail to GenerateAnchor[%d]", ret);
return ret;
}
+ } else if (box.GetDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR) {
+ ret = box.GetDecodeInfo().GenerateYOLOAnchor();
+ if (ret != MEDIA_VISION_ERROR_NONE) {
+ LOGE("Fail to GenerateAnchor[%d]", ret);
+ return ret;
+ }
}
}
#define OD_SNPE_WEIGHT_QC_YOLO_V5_MULTIANCHOR_PATH \
MV_CONFIG_PATH \
"/models/OD/snpe/yolov5s_quantize.dlc"
+#define OD_SNPE_LABEL_QC_YOLO_V5_MULTIANCHOR_PATH \
+ MV_CONFIG_PATH \
+ "/models/OD/snpe/label_coco_80.txt"
void _object_detected_cb(mv_source_h source, const int number_of_objects, const int *indices, const char **names,
const float *confidences, const mv_rectangle_s *locations, void *user_data)
class TestObjectDetectionSnpe : public TestInference
{
public:
- void inferenceDog()
+ void inferenceDog(std::string &answer)
{
- std::string answer("dog");
TestInference::ConfigureInference();
ASSERT_EQ(MediaVision::Common::ImageHelper::loadImageToSource(IMG_DOG, mv_source), MEDIA_VISION_ERROR_NONE);
engine_config_hosted_snpe_model(engine_cfg, OD_SNPE_WEIGHT_QC_EFFCIENTDET_LITE2_448_PATH,
OD_LABEL_EFFICIENTDET_LITE2_448_PATH, _use_json_parser, _target_device_type);
- inferenceDog();
+ std::string answer("dog");
+ inferenceDog(answer);
}
TEST_P(TestObjectDetectionSnpe, DISABLED_YoloV5MultiAnchor)
{
ASSERT_TRUE(_use_json_parser);
engine_config_hosted_snpe_model(engine_cfg, OD_SNPE_WEIGHT_QC_YOLO_V5_MULTIANCHOR_PATH,
- OD_LABEL_EFFICIENTDET_LITE2_448_PATH, _use_json_parser, _target_device_type);
+ OD_SNPE_LABEL_QC_YOLO_V5_MULTIANCHOR_PATH, _use_json_parser, _target_device_type);
- inferenceDog();
+ std::string answer("Dog");
+ inferenceDog(answer);
}
INSTANTIATE_TEST_CASE_P(Prefix, TestObjectDetectionSnpe,