Add yolo5s model on SNPE
[platform/core/api/mediavision.git] / mv_machine_learning / inference / src / DecodeInfo.cpp
1 /**
2  * Copyright (c) 2022 Samsung Electronics Co., Ltd All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include <DecodeInfo.h>
18 #include <Utils.h>
19
20 using namespace mediavision::inference;
21 using namespace mediavision::inference::box;
22
23 void DecodeInfo::AddAnchorBox(cv::Rect2f &anchor)
24 {
25         anchorBoxes.push_back(anchor);
26 }
27
28 void DecodeInfo::ClearAnchorBox()
29 {
30         anchorBoxes.clear();
31 }
32
33 std::vector<cv::Rect2f> &DecodeInfo::GetAnchorBoxAll()
34 {
35         return anchorBoxes;
36 }
37
38 bool DecodeInfo::IsAnchorBoxEmpty()
39 {
40         return anchorBoxes.empty();
41 }
42
43 int DecodeInfo::ParseAnchorParam(JsonObject *root)
44 {
45         JsonObject *object = json_object_get_object_member(root, "anchor");
46
47         anchorParam.mode = static_cast<int>(json_object_get_int_member(object, "mode"));
48
49         if (anchorParam.mode == 0) { // SSD
50                 anchorParam.numLayers = static_cast<int>(json_object_get_int_member(object, "num_layers"));
51
52                 anchorParam.minScale = static_cast<float>(json_object_get_double_member(object, "min_scale"));
53                 anchorParam.maxScale = static_cast<float>(json_object_get_double_member(object, "max_scale"));
54
55                 anchorParam.isReduceBoxedInLowestLayer =
56                                 static_cast<bool>(json_object_get_boolean_member(object, "reduce_boxed_in_lowest_layer"));
57                 anchorParam.interpolatedScaleAspectRatio =
58                                 static_cast<float>(json_object_get_double_member(object, "interpolated_scale_aspect_ratio"));
59                 anchorParam.isFixedAnchorSize = static_cast<bool>(json_object_get_boolean_member(object, "fixed_anchor_size"));
60                 anchorParam.isExponentialBoxScale =
61                                 static_cast<bool>(json_object_get_boolean_member(object, "exponential_box_scale"));
62
63                 anchorParam.xScale = static_cast<float>(json_object_get_double_member(object, "x_scale"));
64                 anchorParam.yScale = static_cast<float>(json_object_get_double_member(object, "y_scale"));
65                 anchorParam.wScale = static_cast<float>(json_object_get_double_member(object, "w_scale"));
66                 anchorParam.hScale = static_cast<float>(json_object_get_double_member(object, "h_scale"));
67
68                 JsonArray *array = json_object_get_array_member(object, "aspect_ratios");
69                 auto elements = json_array_get_length(array);
70                 for (unsigned int elem2 = 0; elem2 < elements; ++elem2) {
71                         auto aspectRatio = static_cast<float>(json_array_get_double_element(array, elem2));
72                         anchorParam.aspectRatios.push_back(aspectRatio);
73                         LOGI("aspectRatio: %.4f", aspectRatio);
74                 }
75         } else if (anchorParam.mode == 1) { // Yolo
76                 anchorParam.offsetAnchors = static_cast<int>(json_object_get_int_member(object, "offset_anchors"));
77                 JsonArray *xScales = json_object_get_array_member(object, "x_scales");
78                 JsonArray *yScales = json_object_get_array_member(object, "y_scales");
79                 unsigned int xElements2 = json_array_get_length(xScales);
80                 unsigned int yElements2 = json_array_get_length(yScales);
81                 if (xElements2 != yElements2) {
82                         LOGE("Invalid x and y scales. They should be the same size");
83                         return MEDIA_VISION_ERROR_INVALID_OPERATION;
84                 }
85
86                 std::vector<double> xScale_;
87                 std::vector<double> yScale_;
88                 for (unsigned int arrayElem2 = 0; arrayElem2 < xElements2; ++arrayElem2) {
89                         auto xScale = static_cast<double>(json_array_get_double_element(xScales, arrayElem2));
90                         auto yScale = static_cast<double>(json_array_get_double_element(yScales, arrayElem2));
91                         LOGI("xScale:%lf, yScale:%lf", xScale, yScale);
92                         xScale_.push_back(xScale);
93                         yScale_.push_back(yScale);
94                 }
95                 anchorParam.vxScales = xScale_;
96                 anchorParam.vyScales = yScale_;
97
98         } else {
99                 LOGE("Invalid anchor mode [%d]", anchorParam.mode);
100                 return MEDIA_VISION_ERROR_INVALID_PARAMETER;
101         }
102
103         anchorParam.inputSizeHeight = static_cast<int>(json_object_get_int_member(object, "input_size_height"));
104         anchorParam.inputSizeWidth = static_cast<int>(json_object_get_int_member(object, "input_size_width"));
105         anchorParam.anchorOffsetX = static_cast<float>(json_object_get_double_member(object, "anchor_offset_x"));
106         anchorParam.anchorOffsetY = static_cast<float>(json_object_get_double_member(object, "anchor_offset_y"));
107
108         JsonArray *array = json_object_get_array_member(object, "strides");
109         unsigned int elements2 = json_array_get_length(array);
110         for (unsigned int elem2 = 0; elem2 < elements2; ++elem2) {
111                 auto stride = static_cast<int>(json_array_get_int_element(array, elem2));
112                 anchorParam.strides.push_back(stride);
113                 LOGI("stride: %d", stride);
114         }
115
116         return MEDIA_VISION_ERROR_NONE;
117 }
118
119 float DecodeInfo::CalculateScale(float min, float max, int index, int maxStride)
120 {
121         return min + (max - min) * 1.0 * index / (maxStride - 1.0f);
122 }
123
124 bool DecodeInfo::IsFixedAnchorSize()
125 {
126         return anchorParam.isFixedAnchorSize;
127 }
128
129 bool DecodeInfo::IsExponentialBoxScale()
130 {
131         return anchorParam.isExponentialBoxScale;
132 }
133
134 float DecodeInfo::GetAnchorXscale()
135 {
136         return anchorParam.xScale;
137 }
138
139 float DecodeInfo::GetAnchorYscale()
140 {
141         return anchorParam.yScale;
142 }
143
144 float DecodeInfo::GetAnchorWscale()
145 {
146         return anchorParam.wScale;
147 }
148
149 float DecodeInfo::GetAnchorHscale()
150 {
151         return anchorParam.hScale;
152 }
153
154 int DecodeInfo::GenerateAnchor()
155 {
156         if (anchorParam.strides.empty() || anchorParam.aspectRatios.empty()) {
157                 LOGE("Invalid anchor parameters");
158                 return MEDIA_VISION_ERROR_INVALID_OPERATION;
159         }
160
161         int layerId = 0;
162         ClearAnchorBox();
163         while (layerId < anchorParam.numLayers) {
164                 std::vector<float> anchorHeight;
165                 std::vector<float> anchorWidth;
166                 std::vector<float> aspectRatios;
167                 std::vector<float> scales;
168
169                 int lastSameStrideLayer = layerId;
170                 std::vector<float>::iterator iter1, iter2;
171                 while ((lastSameStrideLayer < anchorParam.numLayers) &&
172                            (anchorParam.strides[lastSameStrideLayer] == anchorParam.strides[layerId])) {
173                         const float scale = CalculateScale(anchorParam.minScale, anchorParam.maxScale, lastSameStrideLayer,
174                                                                                            anchorParam.strides.size());
175
176                         if (lastSameStrideLayer == 0 && anchorParam.isReduceBoxedInLowestLayer) {
177                                 aspectRatios.push_back(1.0);
178                                 aspectRatios.push_back(2.0);
179                                 aspectRatios.push_back(0.5);
180                                 scales.push_back(0.1);
181                                 scales.push_back(scale);
182                                 scales.push_back(scale);
183                         } else {
184                                 for (iter1 = anchorParam.aspectRatios.begin(); iter1 != anchorParam.aspectRatios.end(); ++iter1) {
185                                         aspectRatios.push_back((*iter1));
186                                         scales.push_back(scale);
187                                 }
188                                 if (anchorParam.interpolatedScaleAspectRatio > 0.0f) {
189                                         const float scaleNext = lastSameStrideLayer == static_cast<int>(anchorParam.strides.size()) - 1 ?
190                                                                                                         1.0f :
191                                                                                                         CalculateScale(anchorParam.minScale, anchorParam.maxScale,
192                                                                                                                                    lastSameStrideLayer + 1, anchorParam.strides.size());
193                                         scales.push_back(std::sqrt(scale * scaleNext));
194                                         aspectRatios.push_back(anchorParam.interpolatedScaleAspectRatio);
195                                 }
196                         }
197                         lastSameStrideLayer++;
198                 }
199
200                 for (iter1 = aspectRatios.begin(), iter2 = scales.begin();
201                          (iter1 != aspectRatios.end() && iter2 != scales.end()); ++iter1, ++iter2) {
202                         const float ratioSqrts = std::sqrt((*iter1));
203                         anchorHeight.push_back((*iter2) / ratioSqrts);
204                         anchorWidth.push_back((*iter2) * ratioSqrts);
205                 }
206
207                 const int stride = anchorParam.strides[layerId];
208                 int featureMapHeight = std::ceil(1.0f * anchorParam.inputSizeHeight / stride);
209                 int featureMapWidth = std::ceil(1.0f * anchorParam.inputSizeWidth / stride);
210
211                 for (int y = 0; y < featureMapHeight; ++y) {
212                         for (int x = 0; x < featureMapWidth; ++x) {
213                                 for (int anchorId = 0; anchorId < (int) anchorHeight.size(); ++anchorId) {
214                                         cv::Rect2f anchor = { cv::Point2f { (x + anchorParam.anchorOffsetX) * 1.0f / featureMapWidth,
215                                                                                                                 (y + anchorParam.anchorOffsetY) * 1.0f / featureMapHeight },
216                                                                                   anchorParam.isFixedAnchorSize ?
217                                                                                                   cv::Size2f { 1.0f, 1.0f } :
218                                                                                                   cv::Size2f { anchorWidth[anchorId], anchorWidth[anchorId] } };
219                                         AddAnchorBox(anchor);
220                                 }
221                         }
222                 }
223                 layerId = lastSameStrideLayer;
224         }
225
226         if (IsAnchorBoxEmpty()) {
227                 LOGE("Anchor boxes are empty");
228                 return MEDIA_VISION_ERROR_INVALID_OPERATION;
229         }
230
231         return MEDIA_VISION_ERROR_NONE;
232 }
233
234 int DecodeInfo::ParseNms(JsonObject *root)
235 {
236         if (!json_object_has_member(root, "nms")) {
237                 LOGI("nms is empty. skip it");
238                 return MEDIA_VISION_ERROR_NONE;
239         }
240
241         JsonObject *object = json_object_get_object_member(root, "nms");
242         try {
243                 nmsParam.mode = GetSupportedType(object, "mode", nmsParam.supportedBoxNmsTypes);
244         } catch (const std::exception &e) {
245                 LOGE("Invalid %s", e.what());
246                 return MEDIA_VISION_ERROR_INVALID_OPERATION;
247         }
248
249         nmsParam.iouThreshold = static_cast<float>(json_object_get_double_member(object, "iou_threshold"));
250
251         return MEDIA_VISION_ERROR_NONE;
252 }
253
254 int DecodeInfo::GetNmsMode()
255 {
256         return nmsParam.mode;
257 }
258
259 float DecodeInfo::GetNmsIouThreshold()
260 {
261         return nmsParam.iouThreshold;
262 }
263
264 int DecodeInfo::ParseRotate(JsonObject *root)
265 {
266         if (!json_object_has_member(root, "rotate")) {
267                 LOGI("rotate is empty. skip it");
268                 return MEDIA_VISION_ERROR_NONE;
269         }
270
271         JsonObject *object = json_object_get_object_member(root, "rotate");
272         rotParam.baseAngle = static_cast<float>(json_object_get_double_member(object, "base_angle"));
273         rotParam.startPointIndex = static_cast<int>(json_object_get_int_member(object, "start_point_index"));
274         rotParam.endPointIndex = static_cast<int>(json_object_get_int_member(object, "end_point_index"));
275
276         return MEDIA_VISION_ERROR_NONE;
277 }
278
279 int DecodeInfo::GetRotStartPointIndex()
280 {
281         return rotParam.startPointIndex;
282 }
283
284 int DecodeInfo::GetRotEndPointIndex()
285 {
286         return rotParam.endPointIndex;
287 }
288
289 float DecodeInfo::GetBaseAngle()
290 {
291         return rotParam.baseAngle;
292 }
293
294 int DecodeInfo::GetRoiMode()
295 {
296         return roiOptParam.mode;
297 }
298
299 int DecodeInfo::GetRoiStartPointIndex()
300 {
301         return roiOptParam.startPointIndex;
302 }
303
304 int DecodeInfo::GetRoiEndPointIndex()
305 {
306         return roiOptParam.endPointIndex;
307 }
308
309 int DecodeInfo::GetRoiCenterPointIndex()
310 {
311         return roiOptParam.centerPointIndex;
312 }
313
314 float DecodeInfo::GetShiftX()
315 {
316         return roiOptParam.shiftX;
317 }
318
319 float DecodeInfo::GetShiftY()
320 {
321         return roiOptParam.shiftY;
322 }
323
324 float DecodeInfo::GetScaleX()
325 {
326         return roiOptParam.scaleX;
327 }
328
329 float DecodeInfo::GetScaleY()
330 {
331         return roiOptParam.scaleY;
332 }
333
334 int DecodeInfo::ParseRoiOption(JsonObject *root)
335 {
336         if (!json_object_has_member(root, "roi")) {
337                 LOGI("roi is empty. skip it");
338                 return MEDIA_VISION_ERROR_NONE;
339         }
340
341         JsonObject *object = json_object_get_object_member(root, "roi");
342         roiOptParam.startPointIndex = static_cast<int>(json_object_get_int_member(object, "start_point_index"));
343         roiOptParam.endPointIndex = static_cast<int>(json_object_get_int_member(object, "end_point_index"));
344         roiOptParam.centerPointIndex = static_cast<int>(json_object_get_int_member(object, "center_point_index"));
345         roiOptParam.shiftX = static_cast<float>(json_object_get_double_member(object, "shift_x"));
346         roiOptParam.shiftY = static_cast<float>(json_object_get_double_member(object, "shift_y"));
347         roiOptParam.scaleX = static_cast<float>(json_object_get_double_member(object, "scale_x"));
348         roiOptParam.scaleY = static_cast<float>(json_object_get_double_member(object, "scale_y"));
349         roiOptParam.mode = static_cast<int>(json_object_get_int_member(object, "scale_mode"));
350
351         return MEDIA_VISION_ERROR_NONE;
352 }
353
354 /**
355  * @ref https://wikidocs.net/163607
356  */
357 int DecodeInfo::GenerateYOLOAnchor()
358 {
359         constexpr int maxAnchorPerCell = 3;
360         LOGI("ENTER");
361         auto anchorIndex = vAnchorBoxes.size();
362         std::vector<cv::Rect2f> cal;
363         auto stride = anchorParam.strides[anchorIndex];
364         auto gridHeight = anchorParam.inputSizeHeight / stride;
365         auto gridWidth = anchorParam.inputSizeWidth / stride;
366
367         for (int y = 0; y < gridHeight; ++y) {
368                 for (int x = 0; x < gridWidth; ++x) {
369                         for (int anchorPerCell = 0; anchorPerCell < maxAnchorPerCell; ++anchorPerCell) {
370                                 cv::Rect2f anchor = { cv::Point2f { (static_cast<float>(x) + anchorParam.anchorOffsetX),
371                                                                                                         (static_cast<float>(y) + anchorParam.anchorOffsetY) },
372                                                                           cv::Size2f { anchorParam.vxScales[anchorPerCell] * static_cast<float>(stride),
373                                                                                                    anchorParam.vyScales[anchorPerCell] * static_cast<float>(stride) } };
374                                 cal.push_back(anchor);
375                         }
376                 }
377         }
378         anchorParam.totalAnchors += cal.size();
379         vAnchorBoxes.push_back(cal);
380
381         LOGI("LEAVE");
382         return MEDIA_VISION_ERROR_NONE;
383 }