[WIP-02] add face landmark
author Tae-Young Chung <ty83.chung@samsung.com>
Wed, 17 Apr 2024 08:05:10 +0000 (17:05 +0900)
committer Tae-Young Chung <ty83.chung@samsung.com>
Wed, 17 Apr 2024 08:22:28 +0000 (17:22 +0900)
- SmartPointer -> GazeEstimator -> inferenceServiceMulti ->
FaceDetection -> FaceLandmark

- test_smartpointer

Signed-off-by: Tae-Young Chung <ty83.chung@samsung.com>
17 files changed:
common/include/SingleoCommonTypes.h
cvFace.png [new file with mode: 0644]
inference/backends/CMakeLists.txt
inference/backends/mediavision/include/MvFaceDetection.h
inference/backends/mediavision/include/MvFaceDetectionMulti.h
inference/backends/mediavision/include/MvFaceLandmarkDetectionMulti.h [new file with mode: 0644]
inference/backends/mediavision/include/MvObjectDetection.h
inference/backends/mediavision/src/MvFaceDetection.cpp
inference/backends/mediavision/src/MvFaceDetectionMulti.cpp
inference/backends/mediavision/src/MvFaceLandmarkDetectionMulti.cpp [new file with mode: 0644]
inference/backends/mediavision/src/MvObjectDetection.cpp
inference/include/IInferenceTaskInterface.h
inference/include/SingleoInferenceTypes.h
inference/src/InferenceServiceDefault.cpp
inference/src/InferenceServiceMulti.cpp
services/smart_pointer/include/GazeEstimator.h
services/smart_pointer/src/GazeEstimator.cpp

index 43758283dbe4eadfd44d36084b5bb211b256ad41..d96992967466388294d6ab5195dbe5e00cba3e06 100644 (file)
@@ -28,6 +28,12 @@ struct Rect {
        int bottom {};
 };
 
+/**
+ * @brief Point with unsigned integer x, y and z components.
+ *
+ * Every component starts at 0 when no initializer is given.
+ */
+struct Point {
+       unsigned int x = 0; // x component
+       unsigned int y = 0; // y component
+       unsigned int z = 0; // z component
+};
+
 using VecRect = std::vector<Rect>;
 
 enum class DataType { NONE, FILE, IMAGE, RAW };
@@ -89,6 +95,13 @@ struct FdResultType : public BaseResultType {
        // TODO
 };
 
+/**
+ * @brief Result type that carries a list of landmark points.
+ *
+ * The base class is constructed with ResultType::LANDMARK.
+ */
+struct FldResultType : BaseResultType {
+       FldResultType() : BaseResultType(ResultType::LANDMARK)
+       {}
+
+       // Landmark points of this result.
+       std::vector<Point> _landmarks;
+       // TODO
+};
+
 enum class ServiceType { NONE, AUTO_ZOOM, SMART_POINTER };
 
 enum class InputFeedType { NONE, CAMERA, SCREEN_CAPTURE };
diff --git a/cvFace.png b/cvFace.png
new file mode 100644 (file)
index 0000000..d6c6bae
Binary files /dev/null and b/cvFace.png differ
index 5e07a50b17ce6e0764c6e3a60f683ec1056efcc8..1685bc0569b7b6014c3214b413788a139c1a3964 100644 (file)
@@ -5,10 +5,10 @@ FILE(GLOB MEDIAVISION_SOURCE_FILES "${PROJECT_SOURCE_DIR}/mediavision/src/*.cpp"
 ADD_LIBRARY(${PROJECT_NAME} SHARED ${MEDIAVISION_SOURCE_FILES})
 
 FIND_PACKAGE(PkgConfig REQUIRED)
-PKG_CHECK_MODULES(${PROJECT_NAME}_DEP REQUIRED capi-media-vision)
+PKG_CHECK_MODULES(${PROJECT_NAME}_DEP REQUIRED capi-media-vision opencv)
 
 TARGET_INCLUDE_DIRECTORIES(${PROJECT_NAME} PRIVATE ../include ../../common/include ../../log/include mediavision/include /usr/include/media)
-TARGET_LINK_LIBRARIES(${PROJECT_NAME} PRIVATE mv_common singleo_log mv_inference mv_object_detection)
+TARGET_LINK_LIBRARIES(${PROJECT_NAME} PRIVATE mv_common singleo_log ${${PROJECT_NAME}_DEP_LIBRARIES} )
 
 # Install the library  
 INSTALL(TARGETS ${PROJECT_NAME} DESTINATION ${LIB_INSTALL_DIR})
index 168d6023a13f9524a88dd7981042e30d592ebdc5..60c9bde908919488dcb43f38950d5e69f96c1d17 100644 (file)
@@ -39,7 +39,7 @@ public:
 
        void configure() override;
        void prepare() override;
-       void invoke(BaseDataType &input, bool async, BaseResultType auxiliary_input = BaseResultType(ResultType::NONE)) override;
+       void invoke(BaseDataType &input, BaseResultType &auxiliary_input, bool async) override;
        BaseResultType &result() override;
 };
 
index b771becf919e26c6faa7c1bb9df034da03fc4b9a..ef77a0bb75266069bc3182c215caa31021bec3a5 100644 (file)
@@ -41,7 +41,7 @@ public:
 
        void configure() override;
        void prepare() override;
-       void invoke(BaseDataType &input, bool async, BaseResultType auxiliary_input = BaseResultType(ResultType::NONE)) override;
+       void invoke(BaseDataType &input, BaseResultType &auxiliary_input, bool async) override;
        BaseResultType &result() override;
 };
 
diff --git a/inference/backends/mediavision/include/MvFaceLandmarkDetectionMulti.h b/inference/backends/mediavision/include/MvFaceLandmarkDetectionMulti.h
new file mode 100644 (file)
index 0000000..30b692c
--- /dev/null
@@ -0,0 +1,54 @@
+/**
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FACE_LANDMARK_DETECTION_MULTI_H__
+#define __FACE_LANDMARK_DETECTION_MULTI_H__
+
+#include <thread>
+#include "IInferenceTaskInterface.h"
+#include "mv_facial_landmark_internal.h"
+#include "SingleoCommonTypes.h"
+
+namespace singleo
+{
+namespace inference
+{
+namespace backends
+{
+/**
+ * @brief Inference task that runs facial landmark detection through the
+ *        Media Vision facial landmark API.
+ *
+ * The object holds a mv_facial_landmark_h and a mv_source_h. Both are created
+ * in the constructor and destroyed in the destructor, so copying is disabled:
+ * two copies would destroy the same handles.
+ */
+class MvFaceLandmarkDetectionMulti : public IInferenceTaskInterface
+{
+private:
+       mv_facial_landmark_h _handle {}; // facial landmark inference handle
+       FldResultType _output_data {}; // landmarks returned by result()
+       mv_source_h _mv_src {}; // media source filled with the image on every invoke()
+
+       // Selects the image region that is handed to the landmark inference.
+       void cropFaceRegion(BaseDataType& input, ImageDataType& output, BaseResultType& roi);
+
+public:
+       MvFaceLandmarkDetectionMulti();
+       virtual ~MvFaceLandmarkDetectionMulti();
+
+       // Non-copyable: the destructor releases _handle and _mv_src exactly once.
+       MvFaceLandmarkDetectionMulti(const MvFaceLandmarkDetectionMulti &) = delete;
+       MvFaceLandmarkDetectionMulti &operator=(const MvFaceLandmarkDetectionMulti &) = delete;
+
+       void configure() override;
+       void prepare() override;
+       void invoke(BaseDataType &input, BaseResultType &auxiliary_input, bool async) override;
+       BaseResultType &result() override;
+};
+
+} // backends
+} // inference
+} // singleo
+
+#endif
index c29b1b66cae7972213b2d1dcf7e10b7b044e7346..04d70f093d652826d8af776e571e8f78819cdc5d 100644 (file)
@@ -39,7 +39,7 @@ public:
 
        void configure() override;
        void prepare() override;
-       void invoke(BaseDataType &input, bool async, BaseResultType auxiliary_input = BaseResultType(ResultType::NONE)) override;
+       void invoke(BaseDataType &input, BaseResultType &auxiliary_input, bool async) override;
        BaseResultType &result() override;
 };
 
index 42b0fd27d45013c191ed52862920402e626802fc..1acf20eeba5d1879fb24b05405a6dbeea940958e 100644 (file)
@@ -53,7 +53,7 @@ void MvFaceDetection::prepare()
                throw runtime_error("Fail to prepare face detection.");
 }
 
-void MvFaceDetection::invoke(BaseDataType &input, bool async, BaseResultType auxiliary_input)
+void MvFaceDetection::invoke(BaseDataType &input, BaseResultType &auxiliary_input, bool async)
 {
        ImageDataType &data = dynamic_cast<ImageDataType &>(input);
 
index 3f84739f00829661e13f666ae9cd2fd098dc5794..0877bbbfc65a1129e1da42495e26270e6f5f88df 100644 (file)
@@ -59,7 +59,7 @@ void MvFaceDetectionMulti::prepare()
 }
 
 
-void MvFaceDetectionMulti::invoke(BaseDataType &input, bool async, BaseResultType auxiliary_input)
+void MvFaceDetectionMulti::invoke(BaseDataType &input, BaseResultType &auxiliary_input, bool async)
 {
        ImageDataType &data = dynamic_cast<ImageDataType &>(input);
 
@@ -103,6 +103,7 @@ BaseResultType &MvFaceDetectionMulti::result()
                        throw runtime_error("Fail to get face detection bound box.");
 
                _output_data._rects.push_back(rect);
+               SINGLEO_LOGD("idx[%2zd]: (%3zd, %3zd, %3zd, %3zd)", idx, rect.left, rect.top, rect.right, rect.bottom);
        }
 
        return _output_data;
diff --git a/inference/backends/mediavision/src/MvFaceLandmarkDetectionMulti.cpp b/inference/backends/mediavision/src/MvFaceLandmarkDetectionMulti.cpp
new file mode 100644 (file)
index 0000000..59100b4
--- /dev/null
@@ -0,0 +1,134 @@
+/**
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdexcept>
+#include <opencv2/core.hpp>
+#include "SingleoInputManager.h"
+#include "MvFaceLandmarkDetectionMulti.h"
+#include "SingleoLog.h"
+
+using namespace std;
+
+namespace singleo
+{
+namespace inference
+{
+namespace backends
+{
+/**
+ * @brief Creates the facial landmark handle and the media source.
+ *
+ * @throws std::runtime_error if either creation call does not return
+ *         MEDIA_VISION_ERROR_NONE.
+ */
+MvFaceLandmarkDetectionMulti::MvFaceLandmarkDetectionMulti()
+{
+       int ret = mv_facial_landmark_create(&_handle);
+       if (ret != MEDIA_VISION_ERROR_NONE)
+               throw runtime_error("Fail to create face landmark detection handle.");
+
+       ret = mv_create_source(&_mv_src);
+       if (ret != MEDIA_VISION_ERROR_NONE) {
+               // The destructor is not run when a constructor throws, so the
+               // already created landmark handle has to be released here.
+               mv_facial_landmark_destroy(_handle);
+               throw runtime_error("Fail to create mv source.");
+       }
+}
+
+/**
+ * @brief Destroys the facial landmark handle, then the media source.
+ */
+MvFaceLandmarkDetectionMulti::~MvFaceLandmarkDetectionMulti()
+{
+       // The return codes of both destroy calls are ignored.
+       mv_facial_landmark_destroy(_handle);
+       mv_destroy_source(_mv_src);
+}
+
+/**
+ * @brief Configures the facial landmark handle with mv_facial_landmark_configure().
+ *
+ * @throws std::runtime_error if the call does not return MEDIA_VISION_ERROR_NONE.
+ */
+void MvFaceLandmarkDetectionMulti::configure()
+{
+       const bool configured = mv_facial_landmark_configure(_handle) == MEDIA_VISION_ERROR_NONE;
+
+       if (!configured)
+               throw runtime_error("Fail to configure face landmark detection.");
+}
+
+/**
+ * @brief Prepares the facial landmark handle with mv_facial_landmark_prepare().
+ *
+ * @throws std::runtime_error if the call does not return MEDIA_VISION_ERROR_NONE.
+ */
+void MvFaceLandmarkDetectionMulti::prepare()
+{
+       const bool prepared = mv_facial_landmark_prepare(_handle) == MEDIA_VISION_ERROR_NONE;
+
+       if (!prepared)
+               throw runtime_error("Fail to prepare face landmark detection.");
+}
+
+/**
+ * @brief Selects the image that the landmark inference should run on.
+ *
+ * If @p roi has no rectangle, or its first rectangle does not overlap the
+ * image, @p output becomes a copy of the input image descriptor. Otherwise the
+ * first rectangle is clipped to the image and its pixels are copied into a
+ * tightly packed buffer that @p output points to.
+ *
+ * @param input  Source image; the dynamic_cast throws std::bad_cast if it is
+ *               not an ImageDataType.
+ * @param output Receives the descriptor of the image to run inference on. When
+ *               a crop is made, output.ptr stays valid only until the next call
+ *               of this function on the same thread.
+ * @param roi    Result holding candidate rectangles; only _rects[0] is used.
+ */
+void MvFaceLandmarkDetectionMulti::cropFaceRegion(BaseDataType& input, ImageDataType& output, BaseResultType& roi)
+{
+       ImageDataType& data = dynamic_cast<ImageDataType&>(input);
+
+       if (roi._rects.empty()) {
+               output = data;
+               return;
+       }
+
+       const auto& face = roi._rects[0];
+
+       // Clip the rectangle to the image. cv::Mat::operator()(Rect) throws when
+       // the rectangle reaches outside of the matrix.
+       const cv::Rect faceRect = cv::Rect(face.left, face.top, face.right - face.left, face.bottom - face.top) &
+                                 cv::Rect(0, 0, data.width, data.height);
+       if (faceRect.width <= 0 || faceRect.height <= 0) {
+               // Nothing of the rectangle lies inside the image: behave as if no ROI was given.
+               output = data;
+               return;
+       }
+
+       cv::Mat cvData(cv::Size(data.width, data.height), CV_MAKETYPE(CV_8U, data.byte_per_pixel), data.ptr);
+
+       // A cv::Mat ROI is only a view that keeps the row stride of the full image,
+       // while the caller treats the pixels as width * height * byte_per_pixel
+       // contiguous bytes. Copy the region into a packed buffer. The buffer is
+       // thread_local so that it outlives this function without touching the
+       // class layout; invoke() consumes it before the next crop.
+       thread_local cv::Mat packedFace;
+       cvData(faceRect).copyTo(packedFace);
+
+       output.pixel_format = data.pixel_format;
+       output.byte_per_pixel = data.byte_per_pixel;
+       output.width = packedFace.cols;
+       output.height = packedFace.rows;
+       output.ptr = packedFace.data;
+}
+
+/**
+ * @brief Runs facial landmark inference on the region selected by cropFaceRegion().
+ *
+ * The image is always handed to the media source as MEDIA_VISION_COLORSPACE_RGB888.
+ * A failure while filling the source or running inference is logged and not
+ * propagated to the caller.
+ *
+ * @param input           Input data; its _data_type must be DataType::IMAGE.
+ * @param auxiliary_input Result forwarded to cropFaceRegion() as the region of interest.
+ * @param async           Not read by this implementation.
+ * @throws std::invalid_argument if @p input is not of type DataType::IMAGE.
+ */
+void MvFaceLandmarkDetectionMulti::invoke(BaseDataType &input, BaseResultType &auxiliary_input, bool async)
+{
+       SINGLEO_LOGD("Invoke FLD");
+       if (input._data_type != DataType::IMAGE) {
+               SINGLEO_LOGE("Invalid input type.");
+               throw invalid_argument("Input type not support.");
+       }
+
+       SINGLEO_LOGD("FLD casting");
+       ImageDataType faceImage;
+       SINGLEO_LOGD("cropping..");
+       cropFaceRegion(input, faceImage, auxiliary_input);
+
+       try {
+               const auto bufferSize = faceImage.width * faceImage.height * faceImage.byte_per_pixel;
+
+               if (mv_source_fill_by_buffer(_mv_src, faceImage.ptr, bufferSize, faceImage.width, faceImage.height,
+                                            MEDIA_VISION_COLORSPACE_RGB888) != MEDIA_VISION_ERROR_NONE)
+                       throw runtime_error("Fail to convert to mv source.");
+
+               if (mv_facial_landmark_inference(_handle, _mv_src) != MEDIA_VISION_ERROR_NONE)
+                       throw runtime_error("Fail to invoke face landmark detection.");
+       } catch (const std::runtime_error &e) {
+               // Both failures above end up here and are only logged.
+               SINGLEO_LOGE("%s", e.what());
+       }
+}
+
+/**
+ * @brief Collects the landmark positions of the last inference.
+ *
+ * Replaces the content of _output_data._landmarks with one Point per reported
+ * result (x and y are filled, z keeps its default) and stores the frame number.
+ *
+ * @return Reference to the internal FldResultType, as BaseResultType.
+ * @throws std::runtime_error if the result count or a position cannot be read.
+ */
+BaseResultType &MvFaceLandmarkDetectionMulti::result()
+{
+       unsigned long frame_number = 0;
+       unsigned int result_cnt = 0;
+
+       int ret = mv_facial_landmark_get_result_count(_handle, &frame_number, &result_cnt);
+       if (ret != MEDIA_VISION_ERROR_NONE)
+               throw runtime_error("Fail to get face landmark detection result count.");
+
+       _output_data._landmarks.clear();
+       _output_data._landmarks.reserve(result_cnt);
+       _output_data._frame_number = frame_number;
+
+       for (unsigned int idx = 0; idx < result_cnt; ++idx) {
+               Point landmark;
+
+               ret = mv_facial_landmark_get_position(_handle, idx, &landmark.x, &landmark.y);
+               if (ret != MEDIA_VISION_ERROR_NONE)
+                       throw runtime_error("Fail to get face landmark position.");
+
+               _output_data._landmarks.push_back(landmark);
+               // idx and the coordinates are unsigned int, so %u is the matching
+               // conversion; %zd expects a signed size type.
+               SINGLEO_LOGD("idx[%2u]: (%3u, %3u)", idx, landmark.x, landmark.y);
+       }
+
+       return _output_data;
+}
+
+}
+}
+}
index 593523c0434598d9551cadeef74425d230a606c1..1aef99169e9e72347f53f832fb806623e5e2607a 100644 (file)
@@ -53,7 +53,7 @@ void MvObjectDetection::prepare()
                throw runtime_error("Fail to prepare object detection.");
 }
 
-void MvObjectDetection::invoke(BaseDataType &input, bool async, BaseResultType auxiliary_input)
+void MvObjectDetection::invoke(BaseDataType &input, BaseResultType &auxiliary_input, bool async)
 {
        ImageDataType &data = dynamic_cast<ImageDataType &>(input);
 
index a8de2564242f834ac342d8b66ba8bcd448fab795..5bb8e84413169e803faeca98c0395bdf31d2b74d 100644 (file)
@@ -31,7 +31,7 @@ public:
 
        virtual void configure() = 0;
        virtual void prepare() = 0;
-       virtual void invoke(BaseDataType &input, bool async = false, BaseResultType auxiliary_input = BaseResultType(ResultType::NONE)) = 0;
+       virtual void invoke(BaseDataType &input, BaseResultType &auxiliary_input, bool async = false) = 0;
        virtual BaseResultType &result() = 0;
 };
 
index 06c41550701b78f53b4683b41938c79e89ec048d..668af21213a6545d14a48f32372d8654bc8ca98a 100644 (file)
@@ -23,7 +23,7 @@ namespace singleo
 {
 namespace inference
 {
-enum class TaskType { NONE, IMAGE_CLASSIFICATION, OBJECT_DETECTION, FACE_DETECTION };
+enum class TaskType { NONE, IMAGE_CLASSIFICATION, OBJECT_DETECTION, FACE_DETECTION, FACE_LANDMARK_DETECTION };
 
 } // inference
 } // singleo
index b6f058de991994434c396db518c3422b879f2d54..5737d40bdfda4cb7965127dc8098f642eb5346fa 100644 (file)
@@ -56,7 +56,7 @@ void InferenceServiceDefault::prepare()
 
 void InferenceServiceDefault::invoke(BaseDataType &input, bool async)
 {
-       _task->invoke(input, async);
+       _task->invoke(input, BaseResultType{ResultType::NONE}, async);
 }
 
 BaseResultType &InferenceServiceDefault::result()
index 8e9d20a617705a4655a4a1774360436eada0b635..531a0bcaa63e6fd4f18b1841c899149a9654229f 100644 (file)
@@ -16,6 +16,7 @@
 
 #include "InferenceServiceMulti.h"
 #include "MvFaceDetectionMulti.h"
+#include "MvFaceLandmarkDetectionMulti.h"
 #include "SingleoLog.h"
 
 using namespace std;
@@ -32,6 +33,11 @@ InferenceServiceMulti::InferenceServiceMulti(std::vector<TaskType> task_types)
         switch (type) {
         case TaskType::FACE_DETECTION:
             _tasks.push_back(std::make_pair(type, make_unique<MvFaceDetectionMulti>()));
+            SINGLEO_LOGI("Add task %s", "FACE_DETECTION");
+            break;
+        case TaskType::FACE_LANDMARK_DETECTION:
+            _tasks.push_back(std::make_pair(type, make_unique<MvFaceLandmarkDetectionMulti>()));
+            SINGLEO_LOGI("Add task %s", "FACE_LANDMARK_DETECTION");
             break;
         }
     }
@@ -56,8 +62,8 @@ void InferenceServiceMulti::invoke(BaseDataType &input, bool async)
 {
     BaseResultType result(ResultType::NONE);
     for (auto task = _tasks.begin(); task!= _tasks.end(); ++task) {
+        task->second->invoke(input, result, async);
         SINGLEO_LOGD("%d task",task->first);
-        task->second->invoke(input, async, result);
         if ((task+1)!= _tasks.end()) {
             result = task->second->result();
         }
index 642cc54f869caccc3960d7aa1618c8dc1f0294c6..4a0439da1bd94ce111e8b6ecce36822138654c05 100644 (file)
@@ -35,7 +35,7 @@ private:
     std::unique_ptr<singleo::inference::IInferenceServiceInterface> _face_estimator;
     std::unique_ptr<singleo::input::IInputService> _input_service;
 
-    const std::vector<inference::TaskType> _tasks { inference::TaskType::FACE_DETECTION };
+    const std::vector<inference::TaskType> _tasks { inference::TaskType::FACE_DETECTION, inference::TaskType::FACE_LANDMARK_DETECTION };
 
 public:
     explicit GazeEstimator(input::InputConfigBase &config);
index b8be354720abd73716e5f5189a1523dd6e08f88b..7fab6ffd5a7391ed2fa1a0ac94c98a3a1303c3f9 100644 (file)
@@ -46,10 +46,16 @@ PoseVector GazeEstimator::estimateHeadpose(BaseDataType &input)
 {
     _face_estimator->invoke(input);
 
-    auto &headPose = _face_estimator->result();
+    SINGLEO_LOGD("Invoke done");
+    auto &result = _face_estimator->result();
+    SINGLEO_LOGD("Result done");
+    if (!result._rects.empty())
+        SINGLEO_LOGD("ROI: %d, %d, %d,%d",
+                result._rects[0].top, result._rects[0].left, result._rects[0].bottom, result._rects[0].right);
 
-    SINGLEO_LOGI("ROI: %d, %d, %d,%d",
-                headPose._rects[0].top, headPose._rects[0].left, headPose._rects[0].bottom, headPose._rects[0].right);
+    auto &headPose = dynamic_cast<FldResultType&>(result);
+    SINGLEO_LOGD("Landmark: %zd, %zd",
+                headPose._landmarks[0].x, headPose._landmarks[0].y);
     return PoseVector{-1, -1, -1};
 }