2 * Copyright (c) 2021 Samsung Electronics Co., Ltd All Rights Reserved
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include "mv_private.h"
18 #include "ObjectDecoder.h"
28 int ObjectDecoder::init()
30 if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_BYPASS) {
31 if (!mTensorBuffer.exist(mMeta.GetLabelName()) || !mTensorBuffer.exist(mMeta.GetNumberName())) {
32 LOGE("buffer buffers named of %s or %s are NULL", mMeta.GetLabelName().c_str(),
33 mMeta.GetNumberName().c_str());
35 return MEDIA_VISION_ERROR_INVALID_OPERATION;
38 std::vector<int> indexes = mMeta.GetNumberDimInfo().GetValidIndexAll();
39 if (indexes.size() != 1) {
40 LOGE("Invalid dim size. It should be 1");
41 return MEDIA_VISION_ERROR_INVALID_OPERATION;
44 // mNumberOfObjects is set again if INFERENCE_BOX_DECODING_TYPE_BYPASS.
45 // Otherwise it is set already within ctor.
46 mNumberOfOjects = mTensorBuffer.getValue<int>(mMeta.GetNumberName(), indexes[0]);
47 } else if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
48 if (mMeta.GetBoxDecodeInfo().IsAnchorBoxEmpty()) {
49 LOGE("Anchor boxes are required but empty.");
50 return MEDIA_VISION_ERROR_INVALID_OPERATION;
53 LOGI("YOLO_ANCHOR does nothing");
56 return MEDIA_VISION_ERROR_NONE;
59 float ObjectDecoder::decodeScore(int idx)
61 float score = mTensorBuffer.getValue<float>(mMeta.GetScoreName(), idx);
62 if (mMeta.GetScoreType() == INFERENCE_SCORE_TYPE_SIGMOID) {
63 score = PostProcess::sigmoid(score);
66 return score < mMeta.GetScoreThreshold() ? 0.0f : score;
// Builds a Box from four values read out of the box tensor.
//
// The four values are read at `idx * mBoxOffset + offset + GetBoxOrder()[k]`
// for k = 0..3. `score` is the already-decoded score of this candidate and
// `label` is used as the box index unless a label tensor name is configured.
// Other call sites in this file pass only two or three arguments, so `label`
// and `offset` presumably have defaults in the header — TODO confirm.
//
// NOTE(review): the embedded line numbers in this listing skip values, so some
// original lines are not shown here (e.g. the declarations of cWidth, cHeight,
// tmpCx and tmpCy, the body of the PIXEL branch, part of the Box initializer
// and the return statement).
69 Box ObjectDecoder::decodeBox(int idx, float score, int label, int offset)
71 // assume type is (cx,cy,w,h)
73 float cx = mTensorBuffer.getValue<float>(mMeta.GetBoxName(), idx * mBoxOffset + offset + mMeta.GetBoxOrder()[0]);
75 float cy = mTensorBuffer.getValue<float>(mMeta.GetBoxName(), idx * mBoxOffset + offset + mMeta.GetBoxOrder()[1]);
// The next two reads use box-order entries [2] and [3]; they are presumably
// the initializers of cWidth and cHeight (declarations not visible) — confirm.
78 mTensorBuffer.getValue<float>(mMeta.GetBoxName(), idx * mBoxOffset + offset + mMeta.GetBoxOrder()[2]);
81 mTensorBuffer.getValue<float>(mMeta.GetBoxName(), idx * mBoxOffset + offset + mMeta.GetBoxOrder()[3]);
83 LOGI("cx:%.2f, cy:%.2f, cW:%.2f, cH:%.2f", cx, cy, cWidth, cHeight);
84 // convert type to ORIGIN_CENTER if ORIGIN_LEFTTOP
// In this mode the four values are treated as (left, top, right, bottom):
// the centre becomes the midpoint and the size becomes the difference.
// tmpCx / tmpCy are presumably copies of the original left / top — confirm.
85 if (mMeta.GetBoxType() == INFERENCE_BOX_TYPE_ORIGIN_LEFTTOP) {
88 cx = (cx + cWidth) * 0.5f; // (left + right)/2
89 cy = (cy + cHeight) * 0.5f; // (top + bottom)/2
90 cWidth = cWidth - tmpCx; // right - left
91 cHeight = cHeight - tmpCy; // bottom - top
94 // convert coordinate to RATIO if PIXEL
// NOTE(review): the box coordinate type is obtained through a getter named
// GetScoreCoordinate() — confirm that it really returns the box coordinate type.
95 if (mMeta.GetScoreCoordinate() == INFERENCE_BOX_COORDINATE_TYPE_PIXEL) {
// Box index: the caller-supplied label when no label tensor name is set,
// otherwise the integer stored at idx in the label tensor.
102 Box box = { .index = mMeta.GetLabelName().empty() ? label : mTensorBuffer.getValue<int>(mMeta.GetLabelName(), idx),
104 .location = cv::Rect2f(cx, cy, cWidth, cHeight) };
109 Box ObjectDecoder::decodeBoxWithAnchor(int idx, int anchorIdx, float score, cv::Rect2f &anchor)
111 // location coordinate of box, the output of decodeBox(), is relative between 0 ~ 1
112 Box box = decodeBox(anchorIdx, score, idx);
114 if (mMeta.GetBoxDecodeInfo().IsFixedAnchorSize()) {
115 box.location.x += anchor.x;
116 box.location.y += anchor.y;
118 box.location.x = box.location.x / mMeta.GetBoxDecodeInfo().GetAnchorXscale() * anchor.width + anchor.x;
119 box.location.y = box.location.y / mMeta.GetBoxDecodeInfo().GetAnchorYscale() * anchor.height + anchor.y;
122 if (mMeta.GetBoxDecodeInfo().IsExponentialBoxScale()) {
123 box.location.width = anchor.width * std::exp(box.location.width / mMeta.GetBoxDecodeInfo().GetAnchorWscale());
124 box.location.height =
125 anchor.height * std::exp(box.location.height / mMeta.GetBoxDecodeInfo().GetAnchorHscale());
127 box.location.width = anchor.width * box.location.width / mMeta.GetBoxDecodeInfo().GetAnchorWscale();
128 box.location.height = anchor.height * box.location.height / mMeta.GetBoxDecodeInfo().GetAnchorHscale();
// Decodes all candidate boxes according to the configured box decoding type
// and stores the outcome in mResultBoxes.
//
//  - BYPASS:      each index is scored and decoded directly into mResultBoxes.
//  - SSD_ANCHOR:  for each index, every anchor box is scored and decoded with
//                 decodeBoxWithAnchor(); the candidates are collected in
//                 boxList and filtered by PostProcess::Nms().
//  - YOLO_ANCHOR: handled after the loop (the statement of that branch is not
//                 visible in this listing).
//
// `ret` starts as MEDIA_VISION_ERROR_NONE and is overwritten with the result
// of Nms(); the return statement itself is not visible here.
//
// NOTE(review): the embedded line numbers skip values, so some original lines
// are not shown (declarations of boxList / boxes / anchorIdx, the handling of
// low scores after decodeScore(), the YOLO branch body, the return).
134 int ObjectDecoder::decode()
140 int ret = MEDIA_VISION_ERROR_NONE;
141 int totalIdx = mNumberOfOjects;
143 for (int idx = 0; idx < totalIdx; ++idx) {
144 if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_BYPASS) {
145 float score = decodeScore(idx);
// decodeBox() is called with its trailing arguments omitted here.
149 Box box = decodeBox(idx, score);
150 mResultBoxes.push_back(box);
151 } else if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR) {
155 for (auto &anchorBox : mMeta.GetBoxDecodeInfo().GetAnchorBoxAll()) {
// Score tensor index for this (anchor, idx) pair.
158 float score = decodeScore(anchorIdx * mNumberOfOjects + idx);
163 Box box = decodeBoxWithAnchor(idx, anchorIdx, score, anchorBox);
164 boxes.push_back(box);
166 boxList.push_back(boxes);
169 if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_YOLO_ANCHOR)
171 else if (mMeta.GetBoxDecodingType() == INFERENCE_BOX_DECODING_TYPE_SSD_ANCHOR)
// NOTE(review): `boxes` is also appended to boxList a few lines above; it looks
// like the SSD path may append the last set of boxes twice — confirm.
172 boxList.push_back(boxes);
// Non-maximum suppression over the collected candidates; survivors are written
// to mResultBoxes.
174 if (!boxList.empty()) {
175 PostProcess postProc;
176 ret = postProc.Nms(boxList, mMeta.GetBoxDecodeInfo().GetNmsMode(),
177 mMeta.GetBoxDecodeInfo().GetNmsIouThreshold(), mResultBoxes);
178 if (ret != MEDIA_VISION_ERROR_NONE) {
179 LOGE("Fail to non-maximum suppression[%d]", ret);
// Presumably the else-branch of the emptiness check above — confirm.
183 LOGW("boxlist empty!");
// Accessor returning a reference to a Boxes collection. The body is not
// visible in this listing; presumably it exposes the decoded results held in
// mResultBoxes — TODO confirm.
191 Boxes &ObjectDecoder::getObjectAll()
196 float ObjectDecoder::decodeYOLOScore(int idx, int nameIdx)
198 auto it = mMeta._tensor_info.begin();
199 std::advance(it, nameIdx);
201 float score = mTensorBuffer.getValue<float>(it->first, idx);
202 if (mMeta.GetScoreType() == INFERENCE_SCORE_TYPE_SIGMOID) {
203 score = PostProcess::sigmoid(score);
// Builds a Box from four values read out of the `nameIdx`-th tensor listed in
// mMeta._tensor_info (in that container's iteration order).
//
// The four values are read at `idx * mBoxOffset + offset + GetBoxOrder()[k]`
// for k = 0..3. `label` becomes the box index and `score` the box score.
// `nameIdx` is not range-checked before the iterator is advanced.
//
// NOTE(review): the embedded line numbers in this listing skip values, so some
// original lines are not shown here (e.g. the declarations of tmpCx / tmpCy,
// the body of the PIXEL branch and the return statement).
208 Box ObjectDecoder::decodeYOLOBox(int idx, float score, int label, int offset, int nameIdx)
// Select the tensor name by position.
210 auto it = mMeta._tensor_info.begin();
211 std::advance(it, nameIdx);
213 // assume type is (cx,cy,w,h)
215 float cx = mTensorBuffer.getValue<float>(it->first, idx * mBoxOffset + offset + mMeta.GetBoxOrder()[0]);
217 float cy = mTensorBuffer.getValue<float>(it->first, idx * mBoxOffset + offset + mMeta.GetBoxOrder()[1]);
219 float cWidth = mTensorBuffer.getValue<float>(it->first, idx * mBoxOffset + offset + mMeta.GetBoxOrder()[2]);
221 float cHeight = mTensorBuffer.getValue<float>(it->first, idx * mBoxOffset + offset + mMeta.GetBoxOrder()[3]);
// The activation applied to the box values is selected by the *score* type:
// all four values go through a sigmoid when the score type is SIGMOID.
223 if (mMeta.GetScoreType() == INFERENCE_SCORE_TYPE_SIGMOID) {
224 cx = PostProcess::sigmoid(cx);
225 cy = PostProcess::sigmoid(cy);
226 cWidth = PostProcess::sigmoid(cWidth);
227 cHeight = PostProcess::sigmoid(cHeight);
230 LOGI("cx:%.2f, cy:%.2f, cW:%.2f, cH:%.2f", cx, cy, cWidth, cHeight);
231 // convert type to ORIGIN_CENTER if ORIGIN_LEFTTOP
// In this mode the four values are treated as (left, top, right, bottom).
// tmpCx / tmpCy are presumably copies of the original left / top — confirm.
232 if (mMeta.GetBoxType() == INFERENCE_BOX_TYPE_ORIGIN_LEFTTOP) {
235 cx = (cx + cWidth) * 0.5f; // (left + right)/2
236 cy = (cy + cHeight) * 0.5f; // (top + bottom)/2
237 cWidth = cWidth - tmpCx; // right - left
238 cHeight = cHeight - tmpCy; // bottom - top
241 // convert coordinate to RATIO if PIXEL
// NOTE(review): the box coordinate type is obtained through a getter named
// GetScoreCoordinate() — confirm that it really returns the box coordinate type.
242 if (mMeta.GetScoreCoordinate() == INFERENCE_BOX_COORDINATE_TYPE_PIXEL) {
249 Box box = { .index = label, .score = score, .location = cv::Rect2f(cx, cy, cWidth, cHeight) };
// Decodes YOLO-style outputs into `boxesList`, one list of boxes per object
// index.
//
// Per-anchor layout implied by the indexing below: the box values start at
// `(mNumberOfOjects + 5) * offset`, the box score sits at `+ 4` and the
// per-object scores at `+ 5 + objIdx`.
//
// Side effects: overwrites mNumberOfOjects and resizes `boxesList` to it.
//
// NOTE(review): the embedded line numbers skip values, so some original lines
// are not shown here (e.g. the declaration receiving the box score, the
// statement guarded by the threshold check, the left-hand side of the width
// assignment and the closing braces).
253 void ObjectDecoder::decodeYOLO(BoxesList &boxesList)
255 box::DecodeInfo &decodeInfo = mMeta.GetBoxDecodeInfo();
256 box::AnchorParam &yoloAnchor = decodeInfo.anchorParam;
258 //offsetAnchors is 3 which is number of BOX
// Object count = values per anchor minus 5.
259 mNumberOfOjects = mBoxOffset / yoloAnchor.offsetAnchors - 5;
260 boxesList.resize(mNumberOfOjects);
// NOTE(review): the loop bound is offsetAnchors, yet strideIdx indexes
// `strides`, `vAnchorBoxes` and the tensor position — this assumes the number
// of strides/output tensors equals offsetAnchors; confirm.
262 for (auto strideIdx = 0; strideIdx < yoloAnchor.offsetAnchors; strideIdx++) {
263 auto &stride = yoloAnchor.strides[strideIdx];
265 int startAnchorIdx = 0;
// Number of positions visited for this stride; evaluated left to right
// (integer arithmetic if `stride` is an integer type — confirm).
266 int endAnchorIdx = (static_cast<int>(mScaleW) / stride * static_cast<int>(mScaleH) / stride);
268 for (int anchorIdx = startAnchorIdx; anchorIdx < endAnchorIdx; anchorIdx++) {
269 // for each grid cell
270 for (int offset = 0; offset < yoloAnchor.offsetAnchors; ++offset) {
272 //handle order is (H,W,A)
// Box score of this anchor (presumably stored in `boxScore`, whose declaration
// is not visible).
274 decodeYOLOScore(anchorIdx * mBoxOffset + (mNumberOfOjects + 5) * offset + 4, strideIdx);
// NOTE(review): vAnchorBoxes is indexed here, before the emptiness check
// further below — confirm it can never be empty at this point.
276 auto anchorBox = decodeInfo.vAnchorBoxes[strideIdx][anchorIdx * yoloAnchor.offsetAnchors + offset];
278 for (int objIdx = 0; objIdx < mNumberOfOjects; ++objIdx) { //each box to every object
279 float objScore = decodeYOLOScore(
280 anchorIdx * mBoxOffset + (mNumberOfOjects + 5) * offset + 5 + objIdx, strideIdx);
// Combined confidence test; the guarded statement is not visible (presumably
// it skips this object — confirm).
282 if (boxScore * objScore < mMeta.GetScoreThreshold())
// The box carries objScore (not the combined product) as its score and objIdx
// as its label.
284 Box box = decodeYOLOBox(anchorIdx, objScore, objIdx, (mNumberOfOjects + 5) * offset, strideIdx);
286 if (!decodeInfo.vAnchorBoxes.empty()) {
// Position: (value * 2 + anchor origin) * stride, divided by mScaleW / mScaleH.
287 box.location.x = (box.location.x * 2 + anchorBox.x) * stride / mScaleW;
288 box.location.y = (box.location.y * 2 + anchorBox.y) * stride / mScaleH;
// Size: (value * 2)^2 * anchor size, divided by mScaleW / mScaleH. The next
// expression is presumably assigned to box.location.width — confirm.
290 (box.location.width * 2) * (box.location.width * 2) * anchorBox.width / mScaleW;
292 box.location.height =
293 (box.location.height * 2) * (box.location.height * 2) * anchorBox.height / mScaleH;
295 boxesList[objIdx].push_back(box);