Port Yolov5 Object Detection model 24/321324/4
authorBhuvan Reddy Gangula <bhu1.gangula@samsung.com>
Tue, 18 Mar 2025 10:54:00 +0000 (19:54 +0900)
committerBhuvan Reddy Gangula <bhu1.gangula@samsung.com>
Tue, 25 Mar 2025 02:22:57 +0000 (11:22 +0900)
Added result decoder for Yolov5 model, which performs Non-Maximum Suppression (NMS) on model output

Change-Id: Id1f7169735cec063661cb227bbae63af7dfec3b4
Signed-off-by: Bhuvan Reddy Gangula <bhu1.gangula@samsung.com>
mv_machine_learning/object_detection/include/object_detection_adapter.h
mv_machine_learning/object_detection/include/object_detection_type.h
mv_machine_learning/object_detection/include/yolov5.h [new file with mode: 0644]
mv_machine_learning/object_detection/src/object_detection_adapter.cpp
mv_machine_learning/object_detection/src/yolov5.cpp [new file with mode: 0644]

index 1d875618ccf912438c19d796bf07d0c01e7152f2..05345460e47c98d674f8ace95cdc5ae2773dcba5 100644 (file)
@@ -23,6 +23,7 @@
 #include "itask.h"
 #include "mobilenet_v1_ssd.h"
 #include "mobilenet_v2_ssd.h"
+#include "yolov5.h"
 
 namespace mediavision
 {
index 208403541d9d317ef02e4d32ec7069f4da593bdb..1cf643df819c6c285dbe1d778c2dd2fc8cf7b003 100644 (file)
@@ -61,7 +61,8 @@ enum class ObjectDetectionTaskType {
        MOBILENET_V2_SSD,
        FD_MOBILENET_V1_SSD,
        OD_PLUGIN,
-       FD_PLUGIN
+       FD_PLUGIN,
+       YOLO_V5,
        // TODO
 };
 
diff --git a/mv_machine_learning/object_detection/include/yolov5.h b/mv_machine_learning/object_detection/include/yolov5.h
new file mode 100644 (file)
index 0000000..ca19fcb
--- /dev/null
@@ -0,0 +1,48 @@
+/**
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __YOLOV5_H__
+#define __YOLOV5_H__
+
+#include <string>
+#include <memory>
+#include <mv_common.h>
+#include "mv_private.h"
+
+#include "object_detection.h"
+#include <mv_inference_type.h>
+#include <dlog.h>
+
+namespace mediavision
+{
+namespace machine_learning
+{
+class YoloV5 : public ObjectDetection
+{
+private:
+	ObjectDetectionResult _result; // decoded detections; rebuilt on every result() call
+
+public:
+	YoloV5(ObjectDetectionTaskType task_type); // forwards the task type to the ObjectDetection base
+	~YoloV5();
+
+	ObjectDetectionResult &result() override; // decodes raw YOLOv5 output and applies NMS
+};
+
+} // machine_learning
+} // mediavision
+
+#endif
index cd35ec8d2ba6bbf01de392f9093f465602f664f7..73e60715cac97cfd816bfaa4ef83e5ed3dfd2a0b 100644 (file)
@@ -64,6 +64,8 @@ template<typename T, typename V> void ObjectDetectionAdapter<T, V>::create(Objec
                _object_detection = make_unique<MobilenetV2Ssd>(task_type);
        else if (task_type == ObjectDetectionTaskType::OD_PLUGIN || task_type == ObjectDetectionTaskType::FD_PLUGIN)
                _object_detection = make_unique<ObjectDetectionExternal>(task_type);
+       else if (task_type == ObjectDetectionTaskType::YOLO_V5)
+               _object_detection = make_unique<YoloV5>(task_type);
        // TODO.
 }
 
@@ -75,8 +77,6 @@ ObjectDetectionTaskType ObjectDetectionAdapter<T, V>::convertToTaskType(string m
 
        transform(model_name.begin(), model_name.end(), model_name.begin(), ::toupper);
 
-       ObjectDetectionTaskType task_type = ObjectDetectionTaskType::OBJECT_DETECTION_TASK_NONE;
-
        if (model_name == "OD_PLUGIN")
                return ObjectDetectionTaskType::OD_PLUGIN;
        else if (model_name == "FD_PLUGIN")
@@ -85,6 +85,8 @@ ObjectDetectionTaskType ObjectDetectionAdapter<T, V>::convertToTaskType(string m
                return ObjectDetectionTaskType::MOBILENET_V1_SSD;
        else if (model_name == "MOBILENET_V2_SSD")
                return ObjectDetectionTaskType::MOBILENET_V2_SSD;
+       else if (model_name == "YOLO_V5")
+               return ObjectDetectionTaskType::YOLO_V5;
        // TODO.
 
        throw InvalidParameter("Invalid object detection model name.");
diff --git a/mv_machine_learning/object_detection/src/yolov5.cpp b/mv_machine_learning/object_detection/src/yolov5.cpp
new file mode 100644 (file)
index 0000000..523ff45
--- /dev/null
@@ -0,0 +1,193 @@
+/**
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <string.h>
+#include <map>
+#include <algorithm>
+
+#include "machine_learning_exception.h"
+#include "mv_object_detection_config.h"
+#include "yolov5.h"
+#include "Postprocess.h"
+
+using namespace std;
+using namespace mediavision::inference;
+using namespace mediavision::machine_learning::exception;
+
+namespace mediavision
+{
+namespace machine_learning
+{
+YoloV5::YoloV5(ObjectDetectionTaskType task_type) : ObjectDetection(task_type), _result()
+{}
+
+YoloV5::~YoloV5()
+{}
+
+static bool compareScore(const Box &box0, const Box &box1) // std::sort predicate: higher score first (descending)
+{
+	return box0.score > box1.score;
+}
+
+static float calcIntersectionOverUnion(const Box &box0, const Box &box1, float input_width, float input_height)
+{
+	float area0 = box0.location.width * box0.location.height; // areas from raw (unclamped) width/height
+	float area1 = box1.location.width * box1.location.height;
+
+	if (area0 <= 0.0f || area1 <= 0.0f) // degenerate box: no meaningful overlap
+		return 0.0f;
+
+	float xmin0 = max(box0.location.x - box0.location.width * 0.5f, 0.0f); // centre/size -> corners, clamped to input area
+	float ymin0 = max(box0.location.y - box0.location.height * 0.5f, 0.0f);
+	float xmax0 = min(box0.location.x + box0.location.width * 0.5f, input_width);
+	float ymax0 = min(box0.location.y + box0.location.height * 0.5f, input_height);
+	float xmin1 = max(box1.location.x - box1.location.width * 0.5f, 0.0f);
+	float ymin1 = max(box1.location.y - box1.location.height * 0.5f, 0.0f);
+	float xmax1 = min(box1.location.x + box1.location.width * 0.5f, input_width);
+	float ymax1 = min(box1.location.y + box1.location.height * 0.5f, input_height);
+
+	float intersectXmin = max(xmin0, xmin1);
+	float intersectYmin = max(ymin0, ymin1);
+	float intersectXmax = min(xmax0, xmax1);
+	float intersectYmax = min(ymax0, ymax1);
+
+	float intersectArea = max((intersectYmax - intersectYmin), 0.0f) * max((intersectXmax - intersectXmin), 0.0f);
+	return intersectArea / (area0 + area1 - intersectArea); // NOTE(review): union uses unclamped areas while the intersection is clamped — confirm this bias is acceptable
+}
+
+ObjectDetectionResult &YoloV5::result()
+{
+       // Clear _result object because result() function can be called every time user wants
+       // so make sure to clear existing result data before getting the data again.
+       memset(reinterpret_cast<void *>(&_result), 0, sizeof(_result));
+
+       vector<string> names;
+       ObjectDetection::getOutputNames(names);
+
+       for (auto &name : names)
+               LOGI("output name: %s", name.c_str());
+
+       vector<float> output_tensor;
+       ObjectDetection::getOutputTensor(names[0], output_tensor);
+
+       auto scoreMetaInfo = _parser->getOutputMetaMap().at(names[0]);
+       auto decodingScore = static_pointer_cast<DecodingScore>(scoreMetaInfo->decodingTypeMap[DecodingType::SCORE]);
+       auto decodingBox = static_pointer_cast<DecodingBox>(scoreMetaInfo->decodingTypeMap[DecodingType::BOX]);
+       auto boxNmsParam = static_pointer_cast<BoxNmsParam>(decodingBox->decodingInfoMap[BoxDecodingType::NMS]);
+       vector<unsigned int> &order = decodingBox->order; // order of (bx, by, bw, bh)
+
+       int input_width = _inference->getInputWidth();
+       int input_height = _inference->getInputHeight();
+
+       int source_width = _preprocess.getImageWidth()[0];
+       int source_height = _preprocess.getImageHeight()[0];
+
+       float w_ratio = (source_width / (float) input_width);
+       float h_ratio = (source_height / (float) input_height);
+
+       /*
+        Output dimensions 1xBx85, B is the number of predicted boxes
+        Each box has 85 numbers : (bx, by, bw, bh, confidence, ...class scores for 80 classes.......)
+        (bx, by) = absolute centre coordinates of the box
+        (bw, bh) = absolute width and height of the box
+    */
+       int box_offset = 5 + _labels.size();
+
+       vector<Box> boxes;
+       float bx, by, bw, bh, confidence, score, logit;
+       int label;
+
+       for (int idx = 0; idx < (int) output_tensor.size(); idx += box_offset) {
+               confidence = output_tensor[idx + 4];
+               if (confidence <= decodingScore->threshold)
+                       continue;
+
+               bx = output_tensor[idx + order[0]];
+               by = output_tensor[idx + order[1]];
+               bw = output_tensor[idx + order[2]];
+               bh = output_tensor[idx + order[3]];
+
+               score = 0;
+               label = 0;
+               for (int k = 5; k < box_offset; k++) {
+                       logit = output_tensor[idx + k];
+                       if (logit > score) {
+                               label = k - 5;
+                               score = logit;
+                       }
+               }
+
+               Box box = { .index = label, .score = confidence, .location = cv::Rect2f(bx, by, bw, bh) }; // x, y, w, h
+               boxes.push_back(box);
+       }
+
+       //sort boxes by confidence in descending order
+       sort(boxes.begin(), boxes.end(), compareScore);
+
+       //Non Maximal Suppression (NMS), class agnostic : boxes with same label are compared against each other
+       bool isIgnore;
+       vector<Box> candidate_box_vec;
+
+       for (auto &decoded_box : boxes) {
+               isIgnore = false;
+
+               for (auto candidate_box = candidate_box_vec.rbegin(); candidate_box != candidate_box_vec.rend();
+                        ++candidate_box) {
+                       float iouValue = calcIntersectionOverUnion(decoded_box, (*candidate_box), input_width, input_height);
+
+                       if (iouValue >= boxNmsParam->iouThreshold) {
+                               isIgnore = true;
+                               break;
+                       }
+               }
+
+               if (!isIgnore)
+                       candidate_box_vec.push_back(decoded_box);
+       }
+
+       float left, top, right, bottom;
+       int idx;
+
+       for (auto &box : candidate_box_vec) {
+               idx = _result.number_of_objects++;
+               _result.indices.push_back(idx);
+               _result.confidences.push_back(box.score);
+               _result.names.push_back(_labels[box.index]);
+
+               left = max(box.location.x - box.location.width * 0.5f, 0.0f);
+               top = max(box.location.y - box.location.height * 0.5f, 0.0f);
+               right = min(box.location.x + box.location.width * 0.5f, (float) input_width);
+               bottom = min(box.location.y + box.location.height * 0.5f, (float) input_height);
+
+               _result.left.push_back(static_cast<int>(left * w_ratio));
+               _result.top.push_back(static_cast<int>(top * h_ratio));
+               _result.right.push_back(static_cast<int>(right * w_ratio));
+               _result.bottom.push_back(static_cast<int>(bottom * h_ratio));
+
+               LOGI("idx = %d, name = %s, score = %f, %dx%d, %dx%d", idx, _result.names[idx].c_str(), _result.confidences[idx],
+                        _result.left[idx], _result.top[idx], _result.right[idx], _result.bottom[idx]);
+
+               if (decodingScore->topNumber == _result.number_of_objects)
+                       break;
+       }
+
+       LOGI("LEAVE");
+
+       return _result;
+}
+
+}
+}