mv_inference: add post-process step to GetObjectDetectionResults()

author Tae-Young Chung <ty83.chung@samsung.com>

Thu, 2 Apr 2020 05:10:56 +0000 (14:10 +0900)

committer Inki Dae <inki.dae@samsung.com>

Tue, 14 Apr 2020 00:42:19 +0000 (09:42 +0900)
author Tae-Young Chung <ty83.chung@samsung.com>
Thu, 2 Apr 2020 05:10:56 +0000 (14:10 +0900)
committer Inki Dae <inki.dae@samsung.com>
Tue, 14 Apr 2020 00:42:19 +0000 (09:42 +0900)
diff --git a/mv_inference/inference/src/Inference.cpp b/mv_inference/inference/src/Inference.cpp

index 057da62..3585300 100755 (executable)
--- a/mv_inference/inference/src/Inference.cpp
+++ b/mv_inference/inference/src/Inference.cpp
@@ -1013,12 +1013,61 @@ int Inference::GetObjectDetectionResults(ObjectDetectionResults *detectionResult
                 return ret;
         }
  
+       // In case of object detection,
+       // a model may apply post-process but others may not.
+       // Thus, those cases should be handled separately.
         std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+       LOGI("inferDimInfo size: %d", outputData.dimInfo.size());
+
         std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
-       float* boxes = reinterpret_cast<float*>(inferResults[0]);
-       float* classes = reinterpret_cast<float*>(inferResults[1]);
-       float* scores = reinterpret_cast<float*>(inferResults[2]);
-       int number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
+       LOGI("inferResults size: %d", inferResults.size());
+
+       float* boxes = nullptr;
+       float* classes = nullptr;
+       float* scores = nullptr;
+       int number_of_detections = 0;
+
+       cv::Mat cvScores, cvClasses, cvBoxes;
+       if (outputData.dimInfo.size() == 1) {
+               // There is no way to know how many objects are detected unless the number of objects
+               // is provided. In that case, each backend should provide the number of results manually.
+               // For example, in OpenCV, MobilenetV1-SSD doesn't provide it, so the number of objects is
+               // written to the 1st element, i.e., outputData.data[0] (the shape is 1x1xNx7 and the 1st
+               // of 7 indicates the image id, which is useless if a batch mode isn't supported).
+               // So, use the 1st of 7.
+
+               number_of_detections = (int)(*reinterpret_cast<float*>(outputData.data[0]));
+               cv::Mat cvOutputData(number_of_detections, inferDimInfo[0][3], CV_32F, outputData.data[0]);
+
+               // boxes
+               cv::Mat cvLeft = cvOutputData.col(3).clone();
+               cv::Mat cvTop = cvOutputData.col(4).clone();
+               cv::Mat cvRight = cvOutputData.col(5).clone();
+               cv::Mat cvBottom = cvOutputData.col(6).clone();
+
+               cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
+               cv::hconcat(cvBoxElems, 4, cvBoxes);
+
+               LOGI("cvBoxes size: %d x %d", cvBoxes.size[0], cvBoxes.size[1]);
+               LOGI("0: %f, %f, %f, %f", cvBoxes.at<float>(0,0), cvBoxes.at<float>(0,1), cvBoxes.at<float>(0,2),cvBoxes.at<float>(0,3));
+
+               // classes
+               cvClasses = cvOutputData.col(1).clone();
+
+               // scores
+               cvScores = cvOutputData.col(2).clone();
+
+               boxes = cvBoxes.ptr<float>(0);
+               classes = cvClasses.ptr<float>(0);
+               scores = cvScores.ptr<float>(0);
+
+       } else {
+               boxes = reinterpret_cast<float*>(inferResults[0]);
+               classes = reinterpret_cast<float*>(inferResults[1]);
+               scores = reinterpret_cast<float*>(inferResults[2]);
+               number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
+       }
+
  
         LOGI("number_of_detections = %d", number_of_detections);
  
@@ -1068,14 +1117,59 @@ int Inference::GetFaceDetectionResults(FaceDetectionResults *detectionResults)
                 return ret;
         }
  
+       // In case of object detection,
+       // a model may apply post-process but others may not.
+       // Thus, those cases should be handled separately.
         std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+       LOGI("inferDimInfo size: %d", outputData.dimInfo.size());
+
         std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
+       LOGI("inferResults size: %d", inferResults.size());
+
+       float* boxes = nullptr;
+       float* classes = nullptr;
+       float* scores = nullptr;
+       int number_of_detections = 0;
+
+       cv::Mat cvScores, cvClasses, cvBoxes;
+       if (outputData.dimInfo.size() == 1) {
+
+               // There is no way to know how many objects are detected unless the number of objects
+               // is provided. In that case, each backend should provide the number of results manually.
+               // For example, in OpenCV, MobilenetV1-SSD doesn't provide it, so the number of objects is
+               // written to the 1st element, i.e., outputData.data[0] (the shape is 1x1xNx7 and the 1st
+               // of 7 indicates the image id, which is useless if a batch mode isn't supported).
+               // So, use the 1st of 7.
+
+               number_of_detections = (int)(*reinterpret_cast<float*>(outputData.data[0]));
+               cv::Mat cvOutputData(number_of_detections, inferDimInfo[0][3], CV_32F, outputData.data[0]);
+
+               // boxes: each row is [image_id, class, score, left, top, right, bottom]
+               cv::Mat cvLeft = cvOutputData.col(3).clone();
+               cv::Mat cvTop = cvOutputData.col(4).clone();
+               cv::Mat cvRight = cvOutputData.col(5).clone();
+               cv::Mat cvBottom = cvOutputData.col(6).clone();
  
-       float* boxes = reinterpret_cast<float*>(inferResults[0]);
-       float* classes = reinterpret_cast<float*>(inferResults[1]);
-       float* scores = reinterpret_cast<float*>(inferResults[2]);
+               cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
+               cv::hconcat(cvBoxElems, 4, cvBoxes);
+
+               // classes
+               cvClasses = cvOutputData.col(1).clone();
+
+               // scores
+               cvScores = cvOutputData.col(2).clone();
+
+               boxes = cvBoxes.ptr<float>(0);
+               classes = cvClasses.ptr<float>(0);
+               scores = cvScores.ptr<float>(0);
+
+       } else {
+               boxes = reinterpret_cast<float*>(inferResults[0]);
+               classes = reinterpret_cast<float*>(inferResults[1]);
+               scores = reinterpret_cast<float*>(inferResults[2]);
+               number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
+       }
  
-       int number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
         int left, top, right, bottom;
         cv::Rect loc;
  
diff --git a/test/testsuites/inference/inference_test_suite.c b/test/testsuites/inference/inference_test_suite.c

index 570b2b7..d984e38 100644 (file)
--- a/test/testsuites/inference/inference_test_suite.c
+++ b/test/testsuites/inference/inference_test_suite.c
@@ -45,6 +45,10 @@
  #define OD_LABEL_PATH "/usr/share/capi-media-vision/models/OD/tflite/od_label.txt"
  #define OD_TFLITE_WEIGHT_PATH "/usr/share/capi-media-vision/models/OD/tflite/od_tflite_model.tflite"
  
+#define OD_OPENCV_LABEL_CAFFE_PATH "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_label_mobilenetv1ssd.txt"
+#define OD_OPENCV_WEIGHT_CAFFE_PATH "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_model_mobilenetv1ssd.caffemodel"
+#define OD_OPENCV_CONFIG_CAFFE_PATH "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_model_mobilenetv1ssd.prototxt"
+
  //Face Detection
  #define FD_TFLITE_WEIGHT_PATH "/usr/share/capi-media-vision/models/FD/tflite/fd_tflite_model1.tflite"
  
@@ -1119,6 +1123,89 @@ int perform_tflite_mobilenetv1ssd_config(mv_engine_config_h *engine_cfg)
      return err;
  }
  
+int perform_opencv_mobilenetv1ssd_config(mv_engine_config_h *engine_cfg)
+{
+    // Build an engine configuration for the Caffe MobileNetV1-SSD model using the OpenCV
+    // backend and the CPU target, and hand it back through *engine_cfg. Returns the create status.
+    int err = MEDIA_VISION_ERROR_NONE;
+    mv_engine_config_h handle = NULL;
+    err = mv_create_engine_config(&handle);
+    if (err != MEDIA_VISION_ERROR_NONE) {
+        printf("Fail to create engine configuration handle.\n");
+        if (handle) {  // creation failed but left a non-NULL handle: destroy it before bailing out
+            int err2 = mv_destroy_engine_config(handle);
+            if (err2 != MEDIA_VISION_ERROR_NONE) {
+                printf("Fail to destroy engine cofniguration.\n");
+            }
+        }
+        return err;  // *engine_cfg is left untouched on failure
+    }
+
+    char *inputNodeName = "data";
+    char *outputNodeName[1] = {"detection_out"};
+    // NOTE: the return values of the attribute setters below are not checked.
+    mv_engine_config_set_string_attribute(handle,
+                        MV_INFERENCE_MODEL_WEIGHT_FILE_PATH,
+                        OD_OPENCV_WEIGHT_CAFFE_PATH);
+
+    mv_engine_config_set_string_attribute(handle,
+                        MV_INFERENCE_MODEL_CONFIGURATION_FILE_PATH,
+                        OD_OPENCV_CONFIG_CAFFE_PATH);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_INPUT_DATA_TYPE,
+                        MV_INFERENCE_DATA_FLOAT32);
+
+    mv_engine_config_set_string_attribute(handle,
+                        MV_INFERENCE_MODEL_USER_FILE_PATH,
+                        OD_OPENCV_LABEL_CAFFE_PATH);  // the "user file" is the label list
+
+    mv_engine_config_set_double_attribute(handle,
+                        MV_INFERENCE_MODEL_MEAN_VALUE,
+                        127.5);
+
+    mv_engine_config_set_double_attribute(handle,
+                        MV_INFERENCE_MODEL_STD_VALUE,
+                        127.5);
+
+    mv_engine_config_set_double_attribute(handle,
+                        MV_INFERENCE_CONFIDENCE_THRESHOLD,
+                        0.3);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_BACKEND_TYPE,
+                        MV_INFERENCE_BACKEND_OPENCV);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_TARGET_TYPE,
+                        MV_INFERENCE_TARGET_CPU);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_INPUT_TENSOR_WIDTH,
+                        300);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_INPUT_TENSOR_HEIGHT,
+                        300);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_INPUT_TENSOR_CHANNELS,
+                        3);
+
+    mv_engine_config_set_string_attribute(handle,
+                        MV_INFERENCE_INPUT_NODE_NAME,
+                        inputNodeName);
+
+    mv_engine_config_set_array_string_attribute(handle,
+                        MV_INFERENCE_OUTPUT_NODE_NAMES,
+                        outputNodeName,
+                        1);  // number of entries in outputNodeName
+
+    *engine_cfg = handle;
+    return err;  // still the value returned by mv_create_engine_config()
+}
+
+
  int perform_armnn_mobilenetv1ssd_config(mv_engine_config_h *engine_cfg)
  {
      int err = MEDIA_VISION_ERROR_NONE;
@@ -1203,9 +1290,10 @@ int perform_object_detection()
      int err = MEDIA_VISION_ERROR_NONE;
  
      int sel_opt = 0;
-    const int options[5] = {1, 2, 3, 4, 5, 6};
-    const *names[5] = { "Configuration",
+    const int options[7] = {1, 2, 3, 4, 5, 6, 7};
+    const char *names[7] = { "Configuration",
                          "TFLITE(CPU) + MobileNetV1+SSD",
+                        "OPENCV(CPU) + MobileNetV1+SSD",
                          "ARMNN(CPU) + MobileNetV1+SSD",
                          "Prepare",
                          "Run",
@@ -1216,7 +1304,7 @@ int perform_object_detection()
      mv_source_h mvSource = NULL;
  
      while(sel_opt == 0) {
-        sel_opt = show_menu("Select Action:", options, names, 5);
+        sel_opt = show_menu("Select Action:", options, names, 7);
          switch (sel_opt) {
          case 1:
          {
@@ -1244,21 +1332,34 @@ int perform_object_detection()
              err = perform_tflite_mobilenetv1ssd_config(&engine_cfg);
          }
              break;
+
          case 3:
          {
-            //perform ARMNN MobileSSD config
+            //perform OpenCV MobileSSD config
              if (engine_cfg) {
                  int err2 = mv_destroy_engine_config(engine_cfg);
                  if (err2 != MEDIA_VISION_ERROR_NONE)
                      printf("Fail to destroy engine_cfg [err:%i]\n", err2);
              }
  
-            err = perform_armnn_mobilenetv1ssd_config(&engine_cfg);
+            err = perform_opencv_mobilenetv1ssd_config(&engine_cfg);
          }
              break;
  
          case 4:
          {
+            //perform ARMNN MobileSSD config
+            if (engine_cfg) {
+                int err2 = mv_destroy_engine_config(engine_cfg);
+                if (err2 != MEDIA_VISION_ERROR_NONE)
+                    printf("Fail to destroy engine_cfg [err:%i]\n", err2);
+            }
+
+            err = perform_armnn_mobilenetv1ssd_config(&engine_cfg);
+        }
+            break;
+        case 5:
+        {
              // create - configure - prepare
              if (infer) {
                  int err2 = mv_inference_destroy(infer);
@@ -1290,7 +1391,7 @@ int perform_object_detection()
              }
          }
              break;
-        case 5:
+        case 6:
          {
              if (mvSource) {
                  int err2 = mv_destroy_source(mvSource);
@@ -1338,7 +1439,7 @@ int perform_object_detection()
  
          }
              break;
-        case 6:
+        case 7:
          {
              //perform destroy
              if (engine_cfg) {
author	Tae-Young Chung <ty83.chung@samsung.com>
	Thu, 2 Apr 2020 05:10:56 +0000 (14:10 +0900)
committer	Inki Dae <inki.dae@samsung.com>
	Tue, 14 Apr 2020 00:42:19 +0000 (09:42 +0900)
mv_inference/inference/src/Inference.cpp		patch \| blob \| history
test/testsuites/inference/inference_test_suite.c		patch \| blob \| history