From: Tae-Young Chung Date: Mon, 7 Oct 2024 08:25:33 +0000 (+0900) Subject: Integrate gaze model to estimate gaze vector X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=refs%2Fheads%2Fpatch%2Fmpiifacegaze_resnetsimple14;p=platform%2Fcore%2Fapi%2Fsingleo.git Integrate gaze model to estimate gaze vector Change-Id: I5a7fc6da1031ff875b0efd38e7ebb3dedd442265 Signed-off-by: Tae-Young Chung --- diff --git a/common/include/SingleoCommonTypes.h b/common/include/SingleoCommonTypes.h index 268b2ca..1b3400c 100644 --- a/common/include/SingleoCommonTypes.h +++ b/common/include/SingleoCommonTypes.h @@ -148,7 +148,7 @@ struct RawDataType : public BaseDataType { } }; -enum class ResultType { NONE, OBJECT_DETECTION, FACE_DETECTION, FACE_LANDMARK, IMAGE_CLASSIFICATION, FACE_RECOGNITION }; +enum class ResultType { NONE, OBJECT_DETECTION, FACE_DETECTION, FACE_LANDMARK, IMAGE_CLASSIFICATION, FACE_RECOGNITION, GAZE_ESTIMATION }; struct BaseResultType { ResultType _type { ResultType::NONE }; @@ -218,6 +218,20 @@ struct FrResultType : public BaseResultType { } }; +struct GazeResultType : public BaseResultType { + GazeResultType() : BaseResultType(ResultType::GAZE_ESTIMATION) + {} + + double _pitch; + double _yaw; + std::vector _points; // 0: center, 1: gazed + + std::shared_ptr clone() override + { + return std::make_shared(*this); + } +}; + enum class ServiceType { NONE, AUTO_ZOOM, FOCUS_FINDER }; enum class InputFeedType { NONE, CAMERA, SCREEN_CAPTURE }; diff --git a/inference/backends/mediavision/CMakeLists.txt b/inference/backends/mediavision/CMakeLists.txt index 380115c..d11fb16 100644 --- a/inference/backends/mediavision/CMakeLists.txt +++ b/inference/backends/mediavision/CMakeLists.txt @@ -14,8 +14,9 @@ SET(SINGLEO_SERVICE_SOURCE_FILES ${INFERENCE_MEDIAVISION_BACKEND_DIRECTORY}/src/MvObjectDetection.cpp ${INFERENCE_MEDIAVISION_BACKEND_DIRECTORY}/src/MvImageClassification.cpp ${INFERENCE_MEDIAVISION_BACKEND_DIRECTORY}/src/MvFaceRecognition.cpp + ${INFERENCE_MEDIAVISION_BACKEND_DIRECTORY}/src/MvGazeEstimation.cpp ${INFERENCE_MEDIAVISION_BACKEND_DIRECTORY}/src/MvInferenceTaskFactory.cpp ) -LIST(APPEND INFERENCE_LIBRARY_LIST ${INFERENCE_LIBRARY_LIST} mv_common mv_inference mv_object_detection mv_landmark_detection mv_image_classification mv_face_recognition) +LIST(APPEND INFERENCE_LIBRARY_LIST ${INFERENCE_LIBRARY_LIST} mv_common mv_inference mv_object_detection mv_landmark_detection mv_image_classification mv_face_recognition mv_gaze_estimation) LIST(APPEND INFERENCE_HEADER_LIST ${INFERENCE_HEADER_LIST} ${INFERENCE_MEDIAVISION_BACKEND_DIRECTORY}/include /usr/include/media) diff --git a/inference/backends/mediavision/include/MvGazeEstimation.h b/inference/backends/mediavision/include/MvGazeEstimation.h new file mode 100644 index 0000000..ad23305 --- /dev/null +++ b/inference/backends/mediavision/include/MvGazeEstimation.h @@ -0,0 +1,50 @@ +/** + * Copyright (c) 2024 Samsung Electronics Co., Ltd All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __MV_GAZE_ESTIMATION_H__ +#define __MV_GAZE_ESTIMATION_H__ + +#include "IInferenceTaskInterface.h" +#include "mv_gaze_estimation_internal.h" +#include "SingleoCommonTypes.h" + +namespace singleo +{ +namespace inference +{ +namespace backends +{ +class MvGazeEstimation : public IInferenceTaskInterface +{ +private: + mv_gaze_estimation_h _handle {}; + GazeResultType _output_data {}; + +public: + MvGazeEstimation(); + virtual ~MvGazeEstimation(); + + void configure() override; + void prepare() override; + void invoke(BaseDataType &input, bool async) override; + BaseResultType &result() override; +}; + +} // backends +} // inference +} // singleo + +#endif diff --git a/inference/backends/mediavision/include/MvInferenceTaskFactory.h b/inference/backends/mediavision/include/MvInferenceTaskFactory.h index f8023c1..ae7841b 100644 --- a/inference/backends/mediavision/include/MvInferenceTaskFactory.h +++ b/inference/backends/mediavision/include/MvInferenceTaskFactory.h @@ -39,6 +39,7 @@ public: std::unique_ptr createFaceRecognition() override; std::unique_ptr createHandDetection() override; std::unique_ptr createHandLandmark() override; + std::unique_ptr createGazeEstimation() override; }; } diff --git a/inference/backends/mediavision/src/MvFaceLandmark.cpp b/inference/backends/mediavision/src/MvFaceLandmark.cpp index d09c5b6..74b0db4 100644 --- a/inference/backends/mediavision/src/MvFaceLandmark.cpp +++ b/inference/backends/mediavision/src/MvFaceLandmark.cpp @@ -46,7 +46,12 @@ MvFaceLandmark::~MvFaceLandmark() void MvFaceLandmark::configure() { - int ret = mv_facial_landmark_configure(_handle); + int ret = mv_facial_landmark_set_model(_handle, "fld_u2net_160x160.tflite", "fld_u2net_160x160.json", + "", "FLD_U2NET"); + if (ret != MEDIA_VISION_ERROR_NONE) + throw runtime_error("Fail to set face landmark model."); + + ret = mv_facial_landmark_configure(_handle); if (ret != MEDIA_VISION_ERROR_NONE) throw runtime_error("Fail to configure face landmark detection."); } diff --git a/inference/backends/mediavision/src/MvGazeEstimation.cpp b/inference/backends/mediavision/src/MvGazeEstimation.cpp new file mode 100644 index 0000000..f21caa6 --- /dev/null +++ b/inference/backends/mediavision/src/MvGazeEstimation.cpp @@ -0,0 +1,122 @@ +/** + * Copyright (c) 2024 Samsung Electronics Co., Ltd All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "MvGazeEstimation.h" +#include "SingleoLog.h" + +using namespace std; + +namespace singleo +{ +namespace inference +{ +namespace backends +{ +MvGazeEstimation::MvGazeEstimation() +{ + int ret = mv_gaze_estimation_create(&_handle); + if (ret != MEDIA_VISION_ERROR_NONE) + throw runtime_error("Fail to create gaze estimation handle."); +} + +MvGazeEstimation::~MvGazeEstimation() +{ + try { + int ret = mv_gaze_estimation_destroy(_handle); + if (ret != MEDIA_VISION_ERROR_NONE) + SINGLEO_LOGE("Fail to destroy gaze estimation handle.(%d)", ret); + } catch (const std::runtime_error &e) { + SINGLEO_LOGE("Failed to destroy gaze estimation handle: %s", e.what()); + } +} + +void MvGazeEstimation::configure() +{ + int ret = mv_gaze_estimation_configure(_handle); + if (ret != MEDIA_VISION_ERROR_NONE) + throw runtime_error("Fail to configure gaze estimation."); +} + +void MvGazeEstimation::prepare() +{ + int ret = mv_gaze_estimation_prepare(_handle); + if (ret != MEDIA_VISION_ERROR_NONE) + throw runtime_error("Fail to prepare gaze estimation."); +} + +void MvGazeEstimation::invoke(BaseDataType &input, bool async) +{ + ImageDataType &data = dynamic_cast(input); + + if (data._data_type != DataType::IMAGE) { + SINGLEO_LOGE("Invalid input type."); + throw invalid_argument("Input type not support."); + } + + mv_source_h mv_src; + + int ret = mv_create_source(&mv_src); + if (ret != MEDIA_VISION_ERROR_NONE) + throw runtime_error("Fail to create mv source."); + + try { + ret = mv_source_fill_by_buffer(mv_src, data.ptr, data.width * data.height * data.byte_per_pixel, data.width, + data.height, MEDIA_VISION_COLORSPACE_RGB888); + if (ret != MEDIA_VISION_ERROR_NONE) + throw runtime_error("Fail to convert to mv source."); + + ret = mv_gaze_estimation_inference(_handle, mv_src); + if (ret != MEDIA_VISION_ERROR_NONE) + throw runtime_error("Fail to invoke gaze estimation."); + } catch (std::runtime_error &e) { + SINGLEO_LOGE("%s", e.what()); + } + + ret = mv_destroy_source(mv_src); + if (ret != MEDIA_VISION_ERROR_NONE) + throw runtime_error("Fail to destroy mv source."); +} + +BaseResultType &MvGazeEstimation::result() +{ + unsigned long frame_number; + unsigned int result_cnt; + + int ret = mv_gaze_estimation_get_result_count(_handle, &frame_number, &result_cnt); + if (ret != MEDIA_VISION_ERROR_NONE) + throw runtime_error("Fail to get gaze estimaton result count."); + + _output_data._frame_number = frame_number; + _output_data._is_empty = result_cnt == 0; + + if (!_output_data._is_empty) { + double pitch, yaw; + + ret = mv_gaze_estimation_get_direction(_handle, &pitch, &yaw); + if (ret != MEDIA_VISION_ERROR_NONE) + throw runtime_error("Fail to get gaze direction."); + + _output_data._pitch = pitch; + _output_data._yaw = yaw; + } + + return _output_data; +} + +} +} +} diff --git a/inference/backends/mediavision/src/MvInferenceTaskFactory.cpp b/inference/backends/mediavision/src/MvInferenceTaskFactory.cpp index 5485fbf..f95cf01 100644 --- a/inference/backends/mediavision/src/MvInferenceTaskFactory.cpp +++ b/inference/backends/mediavision/src/MvInferenceTaskFactory.cpp @@ -23,6 +23,7 @@ #include "MvImageClassification.h" #include "MvHandDetection.h" #include "MvHandLandmark.h" +#include "MvGazeEstimation.h" #include "SingleoLog.h" #include "SingleoException.h" @@ -72,5 +73,9 @@ std::unique_ptr MvInferenceTaskFactory::createHandLandm return make_unique(); } +std::unique_ptr MvInferenceTaskFactory::createGazeEstimation() +{ + return make_unique(); +} } } diff --git a/inference/include/IInferenceTaskFactory.h b/inference/include/IInferenceTaskFactory.h index 893084d..40a0a5f 100644 --- a/inference/include/IInferenceTaskFactory.h +++ b/inference/include/IInferenceTaskFactory.h @@ -37,6 +37,7 @@ public: virtual std::unique_ptr createFaceRecognition() = 0; virtual std::unique_ptr createHandDetection() = 0; virtual std::unique_ptr createHandLandmark() = 0; + virtual std::unique_ptr createGazeEstimation() = 0; }; } diff --git a/services/focus_finder/include/FaceShapeModelManager.h b/services/focus_finder/include/FaceShapeModelManager.h index 6e0acec..64650ff 100644 --- a/services/focus_finder/include/FaceShapeModelManager.h +++ b/services/focus_finder/include/FaceShapeModelManager.h @@ -31,17 +31,19 @@ namespace focusfinder class FaceShapeModelManager { private: - std::vector _faceShape; + std::vector _faceShape; - void loadModelFromFile(const std::string &model_path); template T ToNumber(const std::string &text); std::map > _faceShapeIndices; + void creatFaceShapeIndex(); public: + FaceShapeModelManager(); FaceShapeModelManager(const std::string &model_path); ~FaceShapeModelManager(); - const std::vector &getFaceShape(); + const std::vector &getFaceShape(); const std::vector &getFaceShapeIndices(FaceComponents component); + void loadModelFromFile(const std::string &model_path); }; } // focusfinder } // services diff --git a/services/focus_finder/include/FocusFinder.h b/services/focus_finder/include/FocusFinder.h index b0445f8..c6644e7 100644 --- a/services/focus_finder/include/FocusFinder.h +++ b/services/focus_finder/include/FocusFinder.h @@ -50,8 +50,10 @@ private: std::mutex _input_mutex; FocusFinderResult _result {}; - std::map _result_keys = { { "X", FocusFinderResultType::X }, - { "Y", FocusFinderResultType::Y } }; + std::map _result_keys = { { "CENTER_X", FocusFinderResultType::CX }, + { "CENTER_Y", FocusFinderResultType::CY }, + { "GAZE_X", FocusFinderResultType::GX }, + { "GAZE_Y", FocusFinderResultType::GY } }; bool _async_mode { false }; std::unique_ptr > _async_manager; @@ -59,7 +61,7 @@ private: void *_user_data {}; bool isKeyValid(std::string key); - void updateResult(const std::vector &axes); + void updateResult(const GazeResultType& gaze); public: FocusFinder(); diff --git a/services/focus_finder/include/FocusFinderDataTypes.h b/services/focus_finder/include/FocusFinderDataTypes.h index 0003883..df6d6f0 100644 --- a/services/focus_finder/include/FocusFinderDataTypes.h +++ b/services/focus_finder/include/FocusFinderDataTypes.h @@ -38,6 +38,7 @@ enum class FaceComponents { NOSE_BASE, LEFT_EYE_CORNERS, RIGHT_EYE_CORNERS, + MOUTH_CORNERS }; } } diff --git a/services/focus_finder/include/FocusFinderResult.h b/services/focus_finder/include/FocusFinderResult.h index 6328645..376aab9 100644 --- a/services/focus_finder/include/FocusFinderResult.h +++ b/services/focus_finder/include/FocusFinderResult.h @@ -26,15 +26,17 @@ namespace services { namespace focusfinder { -enum class FocusFinderResultType { X, Y }; +enum class FocusFinderResultType { CX, CY, GX, GY }; struct FocusFinderResult : public ServiceBaseResultType { FocusFinderResult() : ServiceBaseResultType(ServiceResultType::FOCUS_FINDER) {} unsigned int frame_number {}; bool is_focused { false }; + Point center; Point focus; - std::vector head_axes; + double pitch; + double yaw; }; } } diff --git a/services/focus_finder/include/GazeEstimator.h b/services/focus_finder/include/GazeEstimator.h index 721ba91..85653bf 100644 --- a/services/focus_finder/include/GazeEstimator.h +++ b/services/focus_finder/include/GazeEstimator.h @@ -21,6 +21,7 @@ #include "FocusFinderDataTypes.h" #include "IFaceTaskManager.h" #include "HeadPoseEstimator.h" +#include "IInferenceTaskInterface.h" namespace singleo { @@ -31,14 +32,18 @@ namespace focusfinder class GazeEstimator { private: + GazeResultType _result; std::shared_ptr _face_tasker; std::unique_ptr _head_pose_estimator; std::vector _head_axes; + std::unique_ptr _gaze_estimator; + + void drawHeadPose(FaceResult& result, ImageDataType& image, cv::Point3f& rotvec, cv::Point3f& tranvec); public: GazeEstimator(); ~GazeEstimator(); - const std::vector run(BaseDataType &input); + const GazeResultType& run(BaseDataType &input); }; } // focusfinder } // services diff --git a/services/focus_finder/include/HeadPoseEstimator.h b/services/focus_finder/include/HeadPoseEstimator.h index f239b21..f544be8 100644 --- a/services/focus_finder/include/HeadPoseEstimator.h +++ b/services/focus_finder/include/HeadPoseEstimator.h @@ -24,6 +24,7 @@ #include #include #include "SingleoCommonTypes.h" +#include "FaceShapeModelManager.h" namespace singleo { @@ -34,35 +35,52 @@ namespace focusfinder class HeadPoseEstimator { private: - std::vector _landmarks_2d; - std::vector _landmarks_3d; + std::vector _landmarks_2d; + std::vector _landmarks_3d; + std::vector _faceComponentIndices; cv::Mat _camera_matrix; + cv::Mat _camera_matrix_inv; + cv::Mat _normalized_camera_matrix; cv::Mat _camera_dist_coeff; cv::Mat _rotation_vector; cv::Mat _translation_vector; cv::Mat _rotation_matrix; + cv::Mat _translation_matrix; + cv::Mat _normalized_rotation_matrix; + cv::Mat _landmarks_3d_matrix; + cv::Mat _normalized_landmarks_3d_matrix; + cv::Mat _projection_matrix; std::vector _pose_axes_3d; std::vector _pose_axes; + cv::Point3d _center_point; + cv::Point3d _gaze_vector; bool _isInitialized; + FaceShapeModelManager _faceShapeModelmgr; + bool validRotationMatrix(cv::Mat &matrix); cv::Vec3f rotationMatrixToEulerAngles(cv::Mat &matrix); double calculateReProjectectionError(); void print3x3Matrix(cv::Mat &matrix, std::string name); void print3x1Matrix(cv::Mat &matrix, std::string name); void printVec3f(cv::Vec3f &vec, std::string name); + void rtvecTortMatrix(cv::Mat &rvec, cv::Mat &tvec); + void normalizedRotationMatrix(); + void calculateCenter(); + void calculateProjectionMatrix(); public: HeadPoseEstimator(const std::vector &components); ~HeadPoseEstimator(); - std::tuple estimate(const FaceResult &face); + cv::Mat& estimate(const FaceResult &face); std::vector &getPoseAxes(); std::vector &getFaceComponentsIndieces(); + GazeResultType denormalizeGazeAngle(GazeResultType& normalized_angle); }; } // smartpointer } // services diff --git a/services/focus_finder/res/camera_param_dummy.yaml b/services/focus_finder/res/camera_param_dummy.yaml index 80ae3bf..a1a6227 100755 --- a/services/focus_finder/res/camera_param_dummy.yaml +++ b/services/focus_finder/res/camera_param_dummy.yaml @@ -3,8 +3,8 @@ camera_matrix: !!opencv-matrix rows: 3 cols: 3 dt: f - data: [324.0, 0.0, 324.0, - 0.0, 324.0, 216.0, + data: [1280.0, 0.0, 640.0, + 0.0, 1280.0, 959.0, 0.0, 0.0, 1.0] dist_coeff: !!opencv-matrix rows: 5 diff --git a/services/focus_finder/src/FaceShapeModelManager.cpp b/services/focus_finder/src/FaceShapeModelManager.cpp index b9cdf2b..55e6f34 100644 --- a/services/focus_finder/src/FaceShapeModelManager.cpp +++ b/services/focus_finder/src/FaceShapeModelManager.cpp @@ -29,20 +29,15 @@ namespace services { namespace focusfinder { +FaceShapeModelManager::FaceShapeModelManager() +{} + FaceShapeModelManager::FaceShapeModelManager(const string &model_path) { if (model_path.empty()) throw invalid_argument("Invalid face shape model path"); loadModelFromFile(model_path); - _faceShapeIndices.insert( - make_pair >(FaceComponents::NOSE_RIDGE, vector { 27, 28, 29, 30 })); - _faceShapeIndices.insert( - make_pair >(FaceComponents::NOSE_BASE, vector { 31, 32, 33, 34, 35 })); - _faceShapeIndices.insert( - make_pair >(FaceComponents::LEFT_EYE_CORNERS, vector { 36, 39 })); - _faceShapeIndices.insert( - make_pair >(FaceComponents::RIGHT_EYE_CORNERS, vector { 42, 45 })); } FaceShapeModelManager::~FaceShapeModelManager() @@ -56,6 +51,20 @@ template T FaceShapeModelManager::ToNumber(const string &text) return number; } +void FaceShapeModelManager::creatFaceShapeIndex() +{ + _faceShapeIndices.insert( + make_pair >(FaceComponents::NOSE_RIDGE, vector { 27, 28, 29, 30 })); + _faceShapeIndices.insert( + make_pair >(FaceComponents::NOSE_BASE, vector { 31, 32, 33, 34, 35 })); + _faceShapeIndices.insert( + make_pair >(FaceComponents::RIGHT_EYE_CORNERS, vector { 36, 39 })); + _faceShapeIndices.insert( + make_pair >(FaceComponents::LEFT_EYE_CORNERS, vector { 42, 45 })); + _faceShapeIndices.insert( + make_pair >(FaceComponents::MOUTH_CORNERS, vector { 48, 54 })); +} + void FaceShapeModelManager::loadModelFromFile(const string &model_path) { fstream file(model_path, ios::in); @@ -68,23 +77,24 @@ void FaceShapeModelManager::loadModelFromFile(const string &model_path) _faceShape.resize(ToNumber(line)); for (auto &point : _faceShape) { getline(file, line); - point.x = -1.0f * ToNumber(line); + point.x = ToNumber(line) / 1000.0; } for (auto &point : _faceShape) { getline(file, line); - point.y = -1.0f * ToNumber(line); + point.y = ToNumber(line) / 1000.0; } for (auto &point : _faceShape) { getline(file, line); - point.z = ToNumber(line); + point.z = ToNumber(line) / 1000.0; } SINGLEO_LOGI("%zd landarks", _faceShape.size()); + creatFaceShapeIndex(); } -const vector &FaceShapeModelManager::getFaceShape() +const vector &FaceShapeModelManager::getFaceShape() { return _faceShape; } diff --git a/services/focus_finder/src/FocusFinder.cpp b/services/focus_finder/src/FocusFinder.cpp index 6dac3a1..8248630 100644 --- a/services/focus_finder/src/FocusFinder.cpp +++ b/services/focus_finder/src/FocusFinder.cpp @@ -179,16 +179,20 @@ void FocusFinder::performAsync() }); } -void FocusFinder::updateResult(const vector &axes) +void FocusFinder::updateResult(const GazeResultType& gaze) { FocusFinderResult result; - // TODO: Implement focus point calculation based on head axes. - result.focus = Point { .x = 0, .y = 0 }; + // TODO: Implement focus point calculation based on gaze - result.head_axes = axes; - if (!result.head_axes.empty()) { - result.is_focused = true; - } + if (gaze._points.empty()) + return; + + result.is_focused = true; + result.center = gaze._points[0]; + result.focus = gaze._points[1]; + + result.pitch = gaze._pitch; + result.yaw = gaze._yaw; if (_async_mode) _async_manager->pushOutput(result); @@ -207,7 +211,29 @@ void FocusFinder::getResultCnt(unsigned int *cnt) } void FocusFinder::getResultInt(unsigned int idx, std::string key, unsigned int *value) -{} +{ + transform(key.begin(), key.end(), key.begin(), ::toupper); + + SINGLEO_LOGD("given key = %s", key.c_str()); + + if (!isKeyValid(key)) + throw InvalidParameter("A given result key is inavlid."); + + switch (_result_keys[key]) { + case FocusFinderResultType::CX: + *value = static_cast(_result.center.x); + break; + case FocusFinderResultType::CY: + *value = static_cast(_result.center.y); + break; + case FocusFinderResultType::GX: + *value = static_cast(_result.focus.x); + break; + case FocusFinderResultType::GY: + *value = static_cast(_result.focus.y); + break; + } +} void FocusFinder::registerUserCallback(singleo_user_cb_t user_cb, void *user_data) { diff --git a/services/focus_finder/src/FocusMapper.cpp b/services/focus_finder/src/FocusMapper.cpp index c393fb5..3623f4f 100644 --- a/services/focus_finder/src/FocusMapper.cpp +++ b/services/focus_finder/src/FocusMapper.cpp @@ -17,6 +17,7 @@ #include "SingleoException.h" #include "SingleoLog.h" #include "FocusMapper.h" +#include using namespace std; using namespace cv; @@ -53,12 +54,8 @@ void FocusMapper::invoke(BaseDataType &data) _cv_image = cv::Mat(cv::Size(image_data.width, image_data.height), CV_MAKETYPE(CV_8U, 3), image_data.ptr).clone(); - line(_cv_image, Point2i(_result.head_axes[0].x, _result.head_axes[0].y), - Point2i(_result.head_axes[1].x, _result.head_axes[1].y), Scalar(0, 0, 255), 2); // x - red - line(_cv_image, Point2i(_result.head_axes[0].x, _result.head_axes[0].y), - Point2i(_result.head_axes[2].x, _result.head_axes[2].y), Scalar(0, 255, 0), 2); // y - green - line(_cv_image, Point2i(_result.head_axes[0].x, _result.head_axes[0].y), - Point2i(_result.head_axes[3].x, _result.head_axes[3].y), Scalar(255, 0, 0), 2); // z - blue + line(_cv_image, Point2i(_result.center.x, _result.center.y), + Point2i(_result.focus.x, _result.focus.y), Scalar(0, 0, 255), 2); // projected gaze vector _image_data.width = _cv_image.cols; _image_data.height = _cv_image.rows; diff --git a/services/focus_finder/src/GazeEstimator.cpp b/services/focus_finder/src/GazeEstimator.cpp index b80abfe..5c99701 100644 --- a/services/focus_finder/src/GazeEstimator.cpp +++ b/services/focus_finder/src/GazeEstimator.cpp @@ -19,6 +19,7 @@ #include "FaceDetector.h" #include "FaceLandmarker.h" #include "SingleoLog.h" +#include "InferenceTaskFactory.h" using namespace std; using namespace cv; @@ -36,14 +37,19 @@ GazeEstimator::GazeEstimator() const vector _faceComponents { FaceComponents::NOSE_RIDGE, FaceComponents::NOSE_BASE, FaceComponents::LEFT_EYE_CORNERS, - FaceComponents::RIGHT_EYE_CORNERS }; + FaceComponents::RIGHT_EYE_CORNERS, + FaceComponents::MOUTH_CORNERS }; _head_pose_estimator = make_unique(_faceComponents); + _gaze_estimator = + inference::InferenceTaskFactory::instance().create("MvInferenceTaskFactory")->createGazeEstimation(); + _gaze_estimator->configure(); + _gaze_estimator->prepare(); } GazeEstimator::~GazeEstimator() {} -const vector GazeEstimator::run(BaseDataType &input) +const GazeResultType& GazeEstimator::run(BaseDataType &input) { FaceResult result; _face_tasker->handle(input, result); @@ -53,15 +59,28 @@ const vector GazeEstimator::run(BaseDataType &input) if (!result._landmarks.empty()) { SINGLEO_LOGI("landmarks %zu-%zu", result._landmarks.size(), result._landmarks[0].size()); - auto head_pose = _head_pose_estimator->estimate(result); - SINGLEO_LOGI("head pose: \n\t rotation(x,y,z): %.3f, %.3f, %.3f\n\t translation(x,y,z): %.3f, %.3f, %.3f\n", - get<0>(head_pose).x, get<0>(head_pose).y, get<0>(head_pose).z, get<1>(head_pose).x, - get<1>(head_pose).y, get<1>(head_pose).z); + auto projection_matrix = _head_pose_estimator->estimate(result); - _head_axes = _head_pose_estimator->getPoseAxes(); + auto image = dynamic_cast(input); + Mat cv_rgb_image = cv::Mat(Size(image.width, image.height), CV_MAKE_TYPE(CV_8U, image.byte_per_pixel), image.ptr); + Mat cv_bgr_image; + cvtColor(cv_rgb_image, cv_bgr_image, COLOR_RGB2BGR); + Mat cv_warped_image; + warpPerspective(cv_bgr_image, cv_warped_image, projection_matrix, Size(224, 224)); + + ImageDataType normalized_image; + normalized_image.width = cv_warped_image.cols; + normalized_image.height = cv_warped_image.rows; + normalized_image.byte_per_pixel = cv_warped_image.channels(); + normalized_image.ptr = cv_warped_image.data; + + _gaze_estimator->invoke(normalized_image); + auto normalized_gaze_angle = dynamic_cast(_gaze_estimator->result()); + + _result = _head_pose_estimator->denormalizeGazeAngle(normalized_gaze_angle); } - return _head_axes; + return _result; } } // focusfinder diff --git a/services/focus_finder/src/HeadPoseEstimator.cpp b/services/focus_finder/src/HeadPoseEstimator.cpp index 4bc755c..1af45e0 100644 --- a/services/focus_finder/src/HeadPoseEstimator.cpp +++ b/services/focus_finder/src/HeadPoseEstimator.cpp @@ -15,7 +15,7 @@ */ #include -#include "FaceShapeModelManager.h" +#include #include "CameraParamManager.h" #include "HeadPoseEstimator.h" #include "SingleoLog.h" @@ -31,18 +31,26 @@ namespace focusfinder { HeadPoseEstimator::HeadPoseEstimator(const vector &components) { - FaceShapeModelManager faceShapeModelmgr("/usr/share/singleo/pdm.txt"); - const vector faceShape = faceShapeModelmgr.getFaceShape(); + _faceShapeModelmgr.loadModelFromFile("/usr/share/singleo/pdm.txt"); + const vector faceShape = _faceShapeModelmgr.getFaceShape(); for (auto &component : components) { - for (auto &index : faceShapeModelmgr.getFaceShapeIndices(component)) { - _landmarks_3d.push_back(Point3f(faceShape[index].x, faceShape[index].y, faceShape[index].z)); + for (auto &index : _faceShapeModelmgr.getFaceShapeIndices(component)) { + _landmarks_3d.push_back(Point3d(faceShape[index].x, faceShape[index].y, faceShape[index].z)); _faceComponentIndices.push_back(index); } } + Mat landmarks_3d_matrix(Size(3,_landmarks_3d.size()), CV_64F); + for (size_t i = 0; i < _landmarks_3d.size(); i++) { + landmarks_3d_matrix.at(i,0) = _landmarks_3d[i].x; + landmarks_3d_matrix.at(i,1) = _landmarks_3d[i].y; + landmarks_3d_matrix.at(i,2) = _landmarks_3d[i].z; + } + _landmarks_3d_matrix = landmarks_3d_matrix.clone(); + // Nose base landmarks are 31, 32, 33, 34, 35 and 33 is the nose tip - int noseTipIndex = faceShapeModelmgr.getFaceShapeIndices(FaceComponents::NOSE_BASE)[2]; + int noseTipIndex = _faceShapeModelmgr.getFaceShapeIndices(FaceComponents::NOSE_BASE)[2]; _pose_axes_3d.push_back(Point3f(faceShape[noseTipIndex].x, faceShape[noseTipIndex].y, faceShape[noseTipIndex].z)); _pose_axes_3d.push_back( Point3f(faceShape[noseTipIndex].x - 50, faceShape[noseTipIndex].y, faceShape[noseTipIndex].z)); @@ -53,38 +61,39 @@ HeadPoseEstimator::HeadPoseEstimator(const vector &components) _pose_axes.resize(_pose_axes_3d.size()); _landmarks_2d.resize(_landmarks_3d.size()); - _rotation_vector = Mat(Size(3, 1), CV_32FC1); - _translation_vector = Mat(Size(3, 1), CV_32FC1); - - CameraParamManager cam_dummy("/usr/share/singleo/camera_param_rs2_640.yaml"); - - _camera_matrix = Mat::zeros(Size(3, 3), CV_32FC1); - tuple focalLength = cam_dummy.getFocalLengthXY(); - tuple principlePoint = cam_dummy.getPrinciplePointXY(); - _camera_matrix.at(0, 0) = get<0>(focalLength); - _camera_matrix.at(1, 1) = get<1>(focalLength); - _camera_matrix.at(0, 2) = get<0>(principlePoint); - _camera_matrix.at(1, 2) = get<1>(principlePoint); - _camera_matrix.at(2, 2) = 1.0; - - _camera_dist_coeff = Mat::zeros(Size(1, 5), CV_32FC1); - tuple distCoeff = cam_dummy.getDistortionCoeff(); - _camera_dist_coeff.at(0, 0) = get<0>(distCoeff); - _camera_dist_coeff.at(0, 1) = get<1>(distCoeff); - _camera_dist_coeff.at(0, 2) = get<2>(distCoeff); - _camera_dist_coeff.at(0, 3) = get<3>(distCoeff); - _camera_dist_coeff.at(0, 4) = get<4>(distCoeff); - - for (int r = 0; r < _camera_matrix.rows; ++r) { - for (int c = 0; c < _camera_matrix.cols; ++c) { - SINGLEO_LOGI("%.3f", _camera_matrix.at(r, c)); - } - } - for (int r = 0; r < _camera_dist_coeff.rows; ++r) { - for (int c = 0; c < _camera_dist_coeff.cols; ++c) { - SINGLEO_LOGI("%.3f", _camera_dist_coeff.at(r, c)); - } - } + _rotation_vector = Mat(Size(1, 3), CV_64F); + _translation_vector = Mat(Size(1, 3), CV_64F); + _translation_matrix = Mat(Size(_landmarks_3d.size(), 3), CV_64F); + + // TODO: When using image, cam_dummy should be updated based on the image size. + // CameraParamManager cam_dummy("/usr/share/singleo/camera_param_rs2_640.yaml"); + CameraParamManager cam_dummy("/usr/share/singleo/camera_param_dummy.yaml"); + + _camera_matrix = Mat::zeros(Size(3, 3), CV_64FC1); + tuple focalLength = cam_dummy.getFocalLengthXY(); + tuple principlePoint = cam_dummy.getPrinciplePointXY(); + _camera_matrix.at(0, 0) = get<0>(focalLength); + _camera_matrix.at(1, 1) = get<1>(focalLength); + _camera_matrix.at(0, 2) = get<0>(principlePoint); + _camera_matrix.at(1, 2) = get<1>(principlePoint); + _camera_matrix.at(2, 2) = 1.0; + _camera_matrix_inv = _camera_matrix.inv(); + + _camera_dist_coeff = Mat::zeros(Size(1, 5), CV_64FC1); + tuple distCoeff = cam_dummy.getDistortionCoeff(); + _camera_dist_coeff.at(0, 0) = get<0>(distCoeff); + _camera_dist_coeff.at(0, 1) = get<1>(distCoeff); + _camera_dist_coeff.at(0, 2) = get<2>(distCoeff); + _camera_dist_coeff.at(0, 3) = get<3>(distCoeff); + _camera_dist_coeff.at(0, 4) = get<4>(distCoeff); + + // TODO: This value is only valid when MPIIFACEGAZE. It may be changed when others are used. + _normalized_camera_matrix = Mat::zeros(Size(3,3), CV_64F); + _normalized_camera_matrix.at(0,0) = 1600.0; + _normalized_camera_matrix.at(0,2) = 112.0; + _normalized_camera_matrix.at(1,1) = 1600.0; + _normalized_camera_matrix.at(1,2) = 112.0; + _normalized_camera_matrix.at(2,2) = 1.0; } HeadPoseEstimator::~HeadPoseEstimator() @@ -114,31 +123,104 @@ void HeadPoseEstimator::printVec3f(Vec3f &vec, string name) SINGLEO_LOGD("%.3f, %.3f, %.3f", vec[0], vec[1], vec[2]); } -tuple HeadPoseEstimator::estimate(const FaceResult &face) +void HeadPoseEstimator::rtvecTortMatrix(cv::Mat &rvec, cv::Mat &tvec) +{ + // _rotation_vector(w x h): 1x3, _rotation_matrix is 3x3 + Rodrigues(_rotation_vector, _rotation_matrix); + + // _translation vector(w x h): 1 x 3, _translation matrix is 15 x3 + for (int i = 0; i < _translation_matrix.cols; ++i ) { + _translation_matrix.at(0, i) = _translation_vector.at(0, 0); + _translation_matrix.at(1, i) = _translation_vector.at(1, 0); + _translation_matrix.at(2, i) = _translation_vector.at(2, 0); + } +} + +void HeadPoseEstimator::calculateCenter() +{ + int nCenter = 0; + vector components = { FaceComponents::RIGHT_EYE_CORNERS, FaceComponents::LEFT_EYE_CORNERS, FaceComponents::MOUTH_CORNERS }; + _center_point.x = _center_point.y = _center_point.z = 0.0; + + for (const auto& component : components) { + for (auto &index : _faceShapeModelmgr.getFaceShapeIndices(component)) { + int mappedIndex = find(_faceComponentIndices.begin(), _faceComponentIndices.end(), index) - _faceComponentIndices.begin(); + _center_point.x += _normalized_landmarks_3d_matrix.at(mappedIndex, 0); + _center_point.y += _normalized_landmarks_3d_matrix.at(mappedIndex, 1); + _center_point.z += _normalized_landmarks_3d_matrix.at(mappedIndex, 2); + nCenter++; + } + } + + _center_point.x /= static_cast(nCenter); + _center_point.y /= static_cast(nCenter); + _center_point.z /= static_cast(nCenter); +} + +void HeadPoseEstimator::normalizedRotationMatrix() +{ + // calculate center point of normalized 3d landmarks + calculateCenter(); + + // calculate _normalized_rotation_matrix + Mat z_axis = Mat(_center_point/norm(Mat(_center_point), NORM_L2)); + Mat rot_x_axis = _rotation_matrix.col(0); + Mat y_tmp_axis = z_axis.cross(rot_x_axis); + Mat y_axis = y_tmp_axis/norm(y_tmp_axis, NORM_L2); + Mat x_tmp_axis = y_axis.cross(z_axis); + Mat x_axis = x_tmp_axis/norm(x_tmp_axis, NORM_L2); + + // (w x h): 1x3 -> 3x1 + vector array_axis = { x_axis.t(), y_axis.t(), z_axis.t() }; + vconcat(array_axis, _normalized_rotation_matrix); +} + +void HeadPoseEstimator::calculateProjectionMatrix() +{ + Mat scale_matrix = Mat::eye(Size(3,3), CV_64F); + scale_matrix.at(2,2) = 1.0 / norm(Mat(_center_point), NORM_L2); + + Mat conversion_matrix; + // conversion_matrix = scale_matrix @ normalized_rotatio_matrix + gemm(scale_matrix, _normalized_rotation_matrix, 1.0, noArray(), 0.0, conversion_matrix, 0); + + // _projection_matrix = conversion_matrix @ _camera_matrix_inv + gemm(conversion_matrix, _camera_matrix_inv, 1.0, noArray(), 0.0, _projection_matrix, 0); + // _projection_matrix = _normalized_camera_matrix @ _projection_matrix + gemm(_normalized_camera_matrix, _projection_matrix, 1.0, noArray(), 0.0, _projection_matrix, 0); + +} + +Mat& HeadPoseEstimator::estimate(const FaceResult &face) { // TODO: parallel??, c++17 structured binding?? // TODO: use multiple faces size_t idx = 0; for (auto &compIdx : _faceComponentIndices) { - _landmarks_2d[idx].x = face._landmarks[0][compIdx].x + face._rects[0].left; - _landmarks_2d[idx].y = face._landmarks[0][compIdx].y + face._rects[0].top; + _landmarks_2d[idx].x = static_cast(face._landmarks[0][compIdx].x + face._rects[0].left); + _landmarks_2d[idx].y = static_cast(face._landmarks[0][compIdx].y + face._rects[0].top); idx++; } + solvePnPRansac(_landmarks_3d, _landmarks_2d, _camera_matrix, _camera_dist_coeff, _rotation_vector, - _translation_vector, false, 100, 3.0F, 0.99, noArray(), SOLVEPNP_EPNP); + _translation_vector, false, 100, 3.0f, 0.99, noArray(), SOLVEPNP_EPNP); // estimate pose // https://docs.opencv.org/4.x/d9/d0c/group__calib3d.html#ga357634492a94efe8858d0ce1509da869 solvePnP(_landmarks_3d, _landmarks_2d, _camera_matrix, _camera_dist_coeff, _rotation_vector, _translation_vector, true, SOLVEPNP_ITERATIVE); - Rodrigues(_rotation_vector, _rotation_matrix); - auto rotation_eulerAngles = rotationMatrixToEulerAngles(_rotation_matrix); + // convert rotation and translation vector to matrixes + rtvecTortMatrix(_rotation_vector, _translation_vector); + + // normalized 3d landmark with rotation and translation matrix + gemm(_landmarks_3d_matrix, _rotation_matrix, 1.0, _translation_matrix, 1.0, _normalized_landmarks_3d_matrix, GEMM_3_T); + + normalizedRotationMatrix(); - return make_tuple(Point3f { rotation_eulerAngles[0], rotation_eulerAngles[1], rotation_eulerAngles[2] }, - Point3f { static_cast(_translation_vector.at(0)), - static_cast(_translation_vector.at(1)), - static_cast(_translation_vector.at(2)) }); + calculateProjectionMatrix(); + + return _projection_matrix; } vector &HeadPoseEstimator::getPoseAxes() @@ -204,6 +286,38 @@ Vec3f HeadPoseEstimator::rotationMatrixToEulerAngles(Mat &matrix) return Vec3f(x, y, z); } +GazeResultType HeadPoseEstimator::denormalizeGazeAngle(GazeResultType& normalized_angle) +{ + Mat gaze_vector(Size(3,1), CV_64F); + gaze_vector.at(0,0) = -1.0 * cos(normalized_angle._pitch) * sin(normalized_angle._yaw); + gaze_vector.at(0,1) = -1.0 * sin(normalized_angle._pitch); + gaze_vector.at(0,2) = -1.0 * cos(normalized_angle._pitch) * cos(normalized_angle._yaw); + + Mat denormalized_gaze_vector; + gemm(gaze_vector, _normalized_rotation_matrix, 1.0, noArray(), 1.0, denormalized_gaze_vector, 0); + + _gaze_vector.x = denormalized_gaze_vector.at(0,0); + _gaze_vector.y = denormalized_gaze_vector.at(0,1); + _gaze_vector.z = denormalized_gaze_vector.at(0,2); + + GazeResultType gaze_angle; + gaze_angle._pitch = static_cast(asin(-_gaze_vector.y)) * 180.f / M_PI; + gaze_angle._yaw = static_cast(atan2(-_gaze_vector.x, -_gaze_vector.z)) * 180.f / M_PI; + + vector points3d = { _center_point, _center_point + 0.05 * _gaze_vector }; + vector points2d; + + projectPoints(points3d, Mat::zeros(Size(1,3), CV_64F), Mat::zeros(Size(1,3), CV_64F), _camera_matrix, _camera_dist_coeff, points2d); + + Point pt0 = { .x = static_cast(points2d[0].x), .y = static_cast(points2d[0].y) }; + gaze_angle._points.push_back(pt0); + + Point pt1 = { .x = static_cast(points2d[1].x), .y = static_cast(points2d[1].y) }; + gaze_angle._points.push_back(pt1); + + return gaze_angle; +} + } // focusfinder } // services } // singleo diff --git a/test/services/test_focusfinder.cpp b/test/services/test_focusfinder.cpp index 47b6554..9805d02 100644 --- a/test/services/test_focusfinder.cpp +++ b/test/services/test_focusfinder.cpp @@ -24,7 +24,7 @@ #include "singleo_native_capi.h" #include "singleo_error.h" -#define IMG_FACE TEST_RES_PATH "/usr/share/capi-media-vision/res/inference/images/faceDetection.jpg" +#define IMG_FACE TEST_RES_PATH "/usr/share/capi-media-vision/res/inference/images/headpose_sample00.jpg" using namespace testing; using namespace std; @@ -53,6 +53,25 @@ TEST(FocusFinderTest, PerformWithImageFileShouldBeOK) ret = singleo_service_perform(handle); ASSERT_EQ(ret, SINGLEO_ERROR_NONE); + unsigned int cnt = 0; + ret = singleo_service_get_result_cnt(handle, &cnt); + ASSERT_EQ(ret, SINGLEO_ERROR_NONE); + + for (unsigned int idx = 0; idx < cnt; ++idx) { + unsigned int cx, cy, gx, gy; + + ret = singleo_service_get_result_int(handle, idx, "center_x", &cx); + ASSERT_EQ(ret, SINGLEO_ERROR_NONE); + ret = singleo_service_get_result_int(handle, idx, "center_y", &cy); + ASSERT_EQ(ret, SINGLEO_ERROR_NONE); + ret = singleo_service_get_result_int(handle, idx, "gaze_x", &gx); + ASSERT_EQ(ret, SINGLEO_ERROR_NONE); + ret = singleo_service_get_result_int(handle, idx, "gaze_y", &gy); + ASSERT_EQ(ret, SINGLEO_ERROR_NONE); + + cout << "Result: from (" << cx << " , " << cy << ") to (" << gx << " , " << gy << ")" << endl; + } + ret = singleo_service_destroy(handle); ASSERT_EQ(ret, SINGLEO_ERROR_NONE); } \ No newline at end of file