mv_inference: Add hand gesture support
authorInki Dae <inki.dae@samsung.com>
Fri, 29 May 2020 05:51:18 +0000 (14:51 +0900)
committerInki Dae <inki.dae@samsung.com>
Fri, 29 May 2020 05:51:18 +0000 (14:51 +0900)
Change-Id: I9ff7a93eb73527a1c1288daaa914a5e229537264
Signed-off-by: Inki Dae <inki.dae@samsung.com>
include/mv_inference.h
mv_inference/inference/include/Inference.h
mv_inference/inference/include/mv_inference_open.h
mv_inference/inference/src/Inference.cpp
mv_inference/inference/src/mv_inference.c
mv_inference/inference/src/mv_inference_open.cpp

index 118d2db2fec9ff0d6d2757281b5e0cf0f7d734fc..5af4193091db287c131199b547c2be0e73483534 100644 (file)
@@ -636,6 +636,13 @@ typedef void (*mv_inference_pose_estimation_detected_cb)(
        const mv_point_s *locations,
        void *user_data);
 
+/**
+ * @brief Called when hand detection results, requested by
+ *        mv_inference_hand_detect(), are available.
+ *
+ * @since_tizen 6.0
+ * @remarks The @a confidences and @a locations arrays each hold
+ *          @a number_of_hands entries and are valid only while the
+ *          callback is executing.
+ *
+ * @param[in] source          The handle to the media source the detection ran on
+ * @param[in] number_of_hands The number of detected hands
+ * @param[in] confidences     Confidence score of each detected hand
+ * @param[in] locations       Bounding rectangle of each detected hand
+ * @param[in] user_data       The user data passed to mv_inference_hand_detect()
+ *
+ * @see mv_inference_hand_detect()
+ */
+typedef void (*mv_inference_hand_detected_cb)(
+       mv_source_h source,
+       int number_of_hands,
+       const float *confidences,
+       const mv_rectangle_s *locations,
+       void *user_data);
+
 /**
  * @brief Performs facial landmarks detection on the @a source.
  * @details Use this function to launch facial landmark detection.
@@ -720,6 +727,46 @@ int mv_inference_pose_estimation_detect(
        mv_inference_pose_estimation_detected_cb detected_cb,
        void *user_data);
 
+/**
+ * @brief Performs hand detection on the @a source.
+ * @details Use this function to launch hand detection.
+ *          Each time when mv_inference_hand_detect() is
+ *          called, @a detected_cb will receive a list of hands and their locations
+ *          in the media source.
+ *
+ * @since_tizen 6.0
+ * @remarks This function is synchronous and may take considerable time to run.
+ *
+ * @param[in] source         The handle to the source of the media
+ * @param[in] infer          The handle to the inference
+ * @param[in] detected_cb    The callback which will be called for
+ *                           detecting hands on media source.
+ *                           This callback will receive the detection results.
+ * @param[in] user_data      The user data passed from the code where
+ *                           mv_inference_hand_detect() is invoked. This data will
+ *                           be accessible in @a detected_cb callback.
+ * @return @c 0 on success, otherwise a negative error value
+ * @retval #MEDIA_VISION_ERROR_NONE Successful
+ * @retval #MEDIA_VISION_ERROR_NOT_SUPPORTED Not supported
+ * @retval #MEDIA_VISION_ERROR_INVALID_PARAMETER Invalid parameter
+ * @retval #MEDIA_VISION_ERROR_INTERNAL          Internal error
+ * @retval #MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT Source colorspace
+ *                                                  isn't supported
+ *
+ * @pre Create a source handle by calling mv_create_source()
+ * @pre Create an inference handle by calling mv_inference_create()
+ * @pre Configure an inference handle by calling mv_inference_configure()
+ * @pre Prepare an inference by calling mv_inference_prepare()
+ * @post @a detected_cb will be called to provide detection results
+ *
+ * @see mv_inference_hand_detected_cb()
+ */
+int mv_inference_hand_detect(
+       mv_source_h source,
+       mv_inference_h infer,
+       mv_inference_hand_detected_cb detected_cb,
+       void *user_data);
+
 /**
  * @}
  */
index c2a7b2e3ededc91d6c6482573919994c2140e65e..4c126ce9b092ba0b52f92ea30bba3e4e35f713ca 100755 (executable)
@@ -65,6 +65,12 @@ typedef struct _PoseEstimationResults {
     std::vector<cv::Point> locations;
 } PoseEstimationResults;  /**< structure PoseEstimationResults */
 
+/**
+ * @brief Result set of a single hand detection inference run.
+ */
+typedef struct _HandDetectionResults {
+    int number_of_hands;               /**< number of hands kept after thresholding */
+    std::vector<float> confidences;    /**< confidence score per detected hand */
+    std::vector<cv::Rect> locations;   /**< bounding box per detected hand */
+} HandDetectionResults;  /**< structure HandDetectionResults */
+
 namespace mediavision {
 namespace inference {
 
@@ -310,6 +316,14 @@ public:
         */
        int GetPoseEstimationDetectionResults(PoseEstimationResults* results);
 
+       /**
+        * @brief       Gets the HandDetectionResults
+        *
+        * @since_tizen 6.0
+        * @return @c MEDIA_VISION_ERROR_NONE on success, otherwise a negative error value
+        */
+       int GetHandDetectionResults(HandDetectionResults *detectionResults);
+
        int GetResults(std::vector<std::vector<int>>* dimInfo, std::vector<float*> *results);
 
        mv_engine_config_h GetEngineConfig(void) { return engine_config; }
index e3140524580737958b3e4be7d919460cf5f88e6c..3bdc559308e58db0b2d4e91de23d56c797f7ef67 100755 (executable)
@@ -533,6 +533,47 @@ int mv_inference_pose_estimation_detect_open(
        mv_inference_pose_estimation_detected_cb detected_cb,
        void *user_data);
 
+/**
+ * @brief Performs hand detection on the @a source
+ * @details Use this function to launch hand detection.
+ *          Each time when mv_inference_hand_detect_open() is
+ *          called, @a detected_cb will receive a list of hands and their locations
+ *          on the media source.
+ *
+ * @since_tizen 6.0
+ *
+ * @param [in] source         The handle to the source of the media
+ * @param [in] infer          The handle to the inference
+ * @param [in] detected_cb    The callback which will be called for
+ *                            detecting hands on media source.
+ *                            This callback will receive the detection results.
+ * @param [in] user_data      The user data passed from the code where
+ *                            @ref mv_inference_hand_detect() is invoked. This data will
+ *                            be accessible from @a detected_cb callback.
+ * @return @c 0 on success, otherwise a negative error value
+ * @retval #MEDIA_VISION_ERROR_NONE Successful
+ * @retval #MEDIA_VISION_ERROR_INVALID_PARAMETER Invalid parameter
+ * @retval #MEDIA_VISION_ERROR_INTERNAL          Internal error
+ * @retval #MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT Source colorspace
+ *                                                  isn't supported
+ * @retval #MEDIA_VISION_ERROR_OUT_OF_MEMORY Out of memory
+ * @retval #MEDIA_VISION_ERROR_NOT_SUPPORTED Not supported
+ *
+ * @pre Create a source handle by calling @ref mv_create_source()
+ * @pre Create an inference handle by calling @ref mv_inference_create()
+ * @pre Configure an inference handle by calling @ref mv_inference_configure()
+ * @pre Prepare an inference by calling @ref mv_inference_prepare()
+ * @post @a detected_cb will be called to process detection results
+ *
+ * @see mv_inference_hand_detected_cb
+ */
+int mv_inference_hand_detect_open(
+       mv_source_h source,
+       mv_inference_h infer,
+       mv_inference_hand_detected_cb detected_cb,
+       void *user_data);
+
+
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */
index 5a12b350ce5eb509ebef856e1b55673e587c0a6a..7da59f1a2392cad141aa137845ef7e74990d8a3d 100755 (executable)
@@ -1290,5 +1290,102 @@ int Inference::GetPoseEstimationDetectionResults(PoseEstimationResults *detectio
        return MEDIA_VISION_ERROR_NONE;
 }
 
+int Inference::GetHandDetectionResults(HandDetectionResults *detectionResults)
+{
+       tensor_t outputData;
+
+       // Get inference result and contain it to outputData.
+       int ret = FillOutputResult(outputData);
+       if (ret != MEDIA_VISION_ERROR_NONE) {
+               LOGE("Fail to get output result.");
+               return ret;
+       }
+
+       // In case of object detection,
+       // a model may apply post-process but others may not.
+       // Thus, those cases should be hanlded separately.
+       std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+       LOGI("inferDimInfo size: %zu", outputData.dimInfo.size());
+
+       std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
+       LOGI("inferResults size: %zu", inferResults.size());
+
+       float* boxes = nullptr;
+       float* classes = nullptr;
+       float* scores = nullptr;
+       int number_of_detections = 0;
+
+       cv::Mat cvScores, cvClasses, cvBoxes;
+       if (outputData.dimInfo.size() == 1) {
+               // there is no way to know how many objects are detect unless the number of objects aren't
+               // provided. In the case, each backend should provide the number of results manually.
+               // For example, in OpenCV, MobilenetV1-SSD doesn't provide it so the number of objects are
+               // written to the 1st element i.e., outputData.data[0] (the shape is 1x1xNx7 and the 1st of 7
+               // indicats the image id. But it is useless if a batch mode isn't supported.
+               // So, use the 1st of 7.
+
+               number_of_detections = (int)(*reinterpret_cast<float*>(outputData.data[0]));
+               cv::Mat cvOutputData(number_of_detections, inferDimInfo[0][3], CV_32F, outputData.data[0]);
+
+               // boxes
+               cv::Mat cvLeft = cvOutputData.col(3).clone();
+               cv::Mat cvTop = cvOutputData.col(4).clone();
+               cv::Mat cvRight = cvOutputData.col(5).clone();
+               cv::Mat cvBottom = cvOutputData.col(6).clone();
+
+               cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
+               cv::hconcat(cvBoxElems, 4, cvBoxes);
+
+               // classes
+               cvClasses = cvOutputData.col(1).clone();
+
+               // scores
+               cvScores = cvOutputData.col(2).clone();
+
+               boxes = cvBoxes.ptr<float>(0);
+               classes = cvClasses.ptr<float>(0);
+               scores = cvScores.ptr<float>(0);
+
+       } else {
+               boxes = reinterpret_cast<float*>(inferResults[0]);
+               classes = reinterpret_cast<float*>(inferResults[1]);
+               scores = reinterpret_cast<float*>(inferResults[2]);
+               number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
+       }
+
+       int left, top, right, bottom;
+       cv::Rect loc;
+
+       HandDetectionResults results;
+       results.number_of_hands = 0;
+       for (int idx = 0; idx < number_of_detections; ++idx) {
+               if (scores[idx] < mThreshold)
+                       continue;
+
+               left =   (int)(boxes[idx*4 + 1] * mSourceSize.width);
+               top  =   (int)(boxes[idx*4 + 0] * mSourceSize.height);
+               right  = (int)(boxes[idx*4 + 3] * mSourceSize.width);
+               bottom = (int)(boxes[idx*4 + 2] * mSourceSize.height);
+
+               loc.x = left;
+               loc.y = top;
+               loc.width = right -left + 1;
+               loc.height = bottom - top + 1;
+
+               results.confidences.push_back(scores[idx]);
+               results.locations.push_back(loc);
+               results.number_of_hands++;
+
+               LOGI("confidence:%f", scores[idx]);
+               LOGI("class: %f", classes[idx]);
+               LOGI("left:%f, top:%f, right:%f, bottom:%f", boxes[idx*4 + 1], boxes[idx*4 + 0], boxes[idx*4 + 3], boxes[idx*4 + 2]);
+               LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);
+       }
+
+       *detectionResults = results;
+       LOGE("Inference: GetHandDetectionResults: %d\n", results.number_of_hands);
+       return MEDIA_VISION_ERROR_NONE;
+}
+
 } /* Inference */
 } /* MediaVision */
index 56ca9993225d447991f135fe1590de55ee1bb93f..c08339c821385fac108ea91777a1dd30c6b744f3 100755 (executable)
@@ -343,4 +343,34 @@ int mv_inference_pose_estimation_detect(
 
        return ret;
 #endif
+}
+
+int mv_inference_hand_detect(
+       mv_source_h source,
+       mv_inference_h infer,
+       mv_inference_hand_detected_cb detected_cb,
+       void *user_data)
+{
+       MEDIA_VISION_SUPPORT_CHECK(__mv_inference_face_check_system_info_feature_supported());
+       MEDIA_VISION_INSTANCE_CHECK(source);
+       MEDIA_VISION_INSTANCE_CHECK(infer);
+       MEDIA_VISION_NULL_ARG_CHECK(detected_cb);
+
+       MEDIA_VISION_FUNCTION_ENTER();
+
+       int ret = MEDIA_VISION_ERROR_NONE;
+
+#ifdef MEDIA_VISION_INFERENCE_LICENCE_PORT
+       /*
+       ret = mv_inference_hand_detect_lic(source, infer, detected_cb, user_data);
+       */
+#else
+
+       ret = mv_inference_hand_detect_open(source, infer, detected_cb, user_data);
+
+       MEDIA_VISION_FUNCTION_LEAVE();
+
+       return ret;
+
+#endif
 }
\ No newline at end of file
index 57cdfa92fe2e462e85ba792d8e6dfb371ba0f932..2de002a2aea2d22c71bd5648fd64fbc6905b9479 100755 (executable)
@@ -853,3 +853,48 @@ int mv_inference_pose_estimation_detect_open(
 
        return ret;
 }
+
+int mv_inference_hand_detect_open(
+       mv_source_h source,
+       mv_inference_h infer,
+       mv_inference_hand_detected_cb detected_cb,
+       void *user_data)
+{
+       // The handle is created/configured/prepared by the caller (see header pre-conditions).
+       Inference *pInfer = static_cast<Inference *>(infer);
+
+       int ret = MEDIA_VISION_ERROR_NONE;
+       int numberOfOutputs = 0;
+       std::vector<mv_source_h> sources;
+       std::vector<mv_rectangle_s> rects;
+
+       // Run inference on the whole source; no ROI rectangles are supplied.
+       sources.push_back(source);
+
+       ret = pInfer->Run(sources, rects);
+       if (ret != MEDIA_VISION_ERROR_NONE) {
+               LOGE("Fail to run inference");
+               return ret;
+       }
+
+       HandDetectionResults handDetectionResults;
+       ret = pInfer->GetHandDetectionResults(&handDetectionResults);
+       if (ret != MEDIA_VISION_ERROR_NONE) {
+               LOGE("Fail to get inference results");
+               return ret;
+       }
+
+       numberOfOutputs = handDetectionResults.number_of_hands;
+
+       float *confidences = handDetectionResults.confidences.data();
+       std::vector<mv_rectangle_s> locations(numberOfOutputs);
+
+       // Convert the cv::Rect results into the public mv_rectangle_s layout.
+       for (int n = 0; n < numberOfOutputs; ++n) {
+               locations[n].point.x = handDetectionResults.locations[n].x;
+               locations[n].point.y = handDetectionResults.locations[n].y;
+               locations[n].width = handDetectionResults.locations[n].width;
+               locations[n].height = handDetectionResults.locations[n].height;
+       }
+
+       // Deliver results synchronously; the arrays are locals, so they are
+       // valid only for the duration of the callback.
+       detected_cb(source, numberOfOutputs, confidences, locations.data(), user_data);
+
+       return ret;
+}
\ No newline at end of file