From 2cf6383d9a9e3e3e3e3875fc47fe1616ccffa731 Mon Sep 17 00:00:00 2001
From: Tae-Young Chung <ty83.chung@samsung.com>
Date: Thu, 2 Apr 2020 14:10:56 +0900
Subject: [PATCH] mv_inference: add post-process step to
 GetObjectDetectionResults()

To handle model data (SSD) which doesn't have a post-process layer,
add a post-process step.
SSD generally provides a single output with a 1x1xNx7 shape, and the last
dimension holds 'image id, class id, confidence, coordinates (left-top (x,y)
and right-bottom (x,y))'.

Signed-off-by: Tae-Young Chung <ty83.chung@samsung.com>
---
 mv_inference/inference/src/Inference.cpp         | 110 ++++++++++++++++++++--
 test/testsuites/inference/inference_test_suite.c | 115 +++++++++++++++++++++--
 2 files changed, 210 insertions(+), 15 deletions(-)
diff --git a/mv_inference/inference/src/Inference.cpp b/mv_inference/inference/src/Inference.cpp
index 057da62..3585300 100755
--- a/mv_inference/inference/src/Inference.cpp
+++ b/mv_inference/inference/src/Inference.cpp
@@ -1013,12 +1013,61 @@ int Inference::GetObjectDetectionResults(ObjectDetectionResults *detectionResult
 		return ret;
 	}
 
+	// A detection model may or may not include its own post-process layer,
+	// so the raw single-tensor output and the post-processed multi-tensor
+	// output (boxes, classes, scores, count) are handled separately below.
 	std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+	LOGI("inferDimInfo size: %zu", outputData.dimInfo.size());
+
 	std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
-	float* boxes = reinterpret_cast<float*>(inferResults[0]);
-	float* classes = reinterpret_cast<float*>(inferResults[1]);
-	float* scores = reinterpret_cast<float*>(inferResults[2]);
-	int number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
+	LOGI("inferResults size: %zu", inferResults.size());
+
+	float* boxes = nullptr;
+	float* classes = nullptr;
+	float* scores = nullptr;
+	int number_of_detections = 0;
+
+	// Declared outside the branch: boxes/classes/scores point into these buffers afterwards.
+	cv::Mat cvScores, cvClasses, cvBoxes;
+	if (outputData.dimInfo.size() == 1) {
+		// Raw SSD output: a single 1x1xNx7 tensor whose rows are
+		// [image id, class id, confidence, left, top, right, bottom].
+		// The tensor itself does not say how many rows are valid, so the backend
+		// is expected to store that count in the 1st element (the image id slot,
+		// which is unused because batch mode isn't supported).
+		number_of_detections = (int)(*reinterpret_cast<float*>(outputData.data[0]));
+		cv::Mat cvOutputData(number_of_detections, inferDimInfo[0][3], CV_32F, outputData.data[0]);
+
+		// boxes: columns are regrouped as (top, left, bottom, right)
+		cv::Mat cvLeft = cvOutputData.col(3).clone();
+		cv::Mat cvTop = cvOutputData.col(4).clone();
+		cv::Mat cvRight = cvOutputData.col(5).clone();
+		cv::Mat cvBottom = cvOutputData.col(6).clone();
+
+		cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
+		cv::hconcat(cvBoxElems, 4, cvBoxes);
+
+		LOGI("cvBoxes size: %d x %d", cvBoxes.size[0], cvBoxes.size[1]);
+		if (!cvBoxes.empty()) {	// there is no first box to log when nothing was detected
+			LOGI("0: %f, %f, %f, %f", cvBoxes.at<float>(0,0), cvBoxes.at<float>(0,1), cvBoxes.at<float>(0,2),cvBoxes.at<float>(0,3));
+		}
+
+		// classes
+		cvClasses = cvOutputData.col(1).clone();
+
+		// scores
+		cvScores = cvOutputData.col(2).clone();
+
+		boxes = cvBoxes.ptr<float>(0);
+		classes = cvClasses.ptr<float>(0);
+		scores = cvScores.ptr<float>(0);
+	} else {
+		boxes = reinterpret_cast<float*>(inferResults[0]);
+		classes = reinterpret_cast<float*>(inferResults[1]);
+		scores = reinterpret_cast<float*>(inferResults[2]);
+		number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
+	}
+
 
 	LOGI("number_of_detections = %d", number_of_detections);
 
@@ -1068,14 +1117,59 @@ int Inference::GetFaceDetectionResults(FaceDetectionResults *detectionResults)
 		return ret;
 	}
 
+	// A detection model may or may not include its own post-process layer,
+	// so the raw single-tensor output and the post-processed multi-tensor
+	// output (boxes, classes, scores, count) are handled separately below.
 	std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+	LOGI("inferDimInfo size: %zu", outputData.dimInfo.size());
+
 	std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
+	LOGI("inferResults size: %zu", inferResults.size());
+
+	float* boxes = nullptr;
+	float* classes = nullptr;
+	float* scores = nullptr;
+	int number_of_detections = 0;
+
+	// Declared outside the branch: boxes/classes/scores point into these buffers afterwards.
+	cv::Mat cvScores, cvClasses, cvBoxes;
+	if (outputData.dimInfo.size() == 1) {
+		// Raw SSD output: a single 1x1xNx7 tensor whose rows are
+		// [image id, class id, confidence, left, top, right, bottom].
+		// The tensor itself does not say how many rows are valid, so the
+		// backend is expected to store that count in the 1st element
+		// (the image id slot, which is unused because batch mode isn't
+		// supported).
+
+		number_of_detections = (int)(*reinterpret_cast<float*>(outputData.data[0]));
+		cv::Mat cvOutputData(number_of_detections, inferDimInfo[0][3], CV_32F, outputData.data[0]);
+
+		// boxes: columns are regrouped as (top, left, bottom, right)
+		cv::Mat cvLeft = cvOutputData.col(3).clone();
+		cv::Mat cvTop = cvOutputData.col(4).clone();
+		cv::Mat cvRight = cvOutputData.col(5).clone();
+		cv::Mat cvBottom = cvOutputData.col(6).clone();
 
-	float* boxes = reinterpret_cast<float*>(inferResults[0]);
-	float* classes = reinterpret_cast<float*>(inferResults[1]);
-	float* scores = reinterpret_cast<float*>(inferResults[2]);
+		cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
+		cv::hconcat(cvBoxElems, 4, cvBoxes);
+
+		// classes
+		cvClasses = cvOutputData.col(1).clone();
+
+		// scores
+		cvScores = cvOutputData.col(2).clone();
+
+		boxes = cvBoxes.ptr<float>(0);
+		classes = cvClasses.ptr<float>(0);
+		scores = cvScores.ptr<float>(0);
+
+	} else {
+		boxes = reinterpret_cast<float*>(inferResults[0]);
+		classes = reinterpret_cast<float*>(inferResults[1]);
+		scores = reinterpret_cast<float*>(inferResults[2]);
+		number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
+	}
 
-	int number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
 	int left, top, right, bottom;
 	cv::Rect loc;
 
diff --git a/test/testsuites/inference/inference_test_suite.c b/test/testsuites/inference/inference_test_suite.c
index 570b2b7..d984e38 100644
--- a/test/testsuites/inference/inference_test_suite.c
+++ b/test/testsuites/inference/inference_test_suite.c
@@ -45,6 +45,10 @@
 #define OD_LABEL_PATH "/usr/share/capi-media-vision/models/OD/tflite/od_label.txt"
 #define OD_TFLITE_WEIGHT_PATH "/usr/share/capi-media-vision/models/OD/tflite/od_tflite_model.tflite"
 
+#define OD_OPENCV_LABEL_CAFFE_PATH "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_label_mobilenetv1ssd.txt"
+#define OD_OPENCV_WEIGHT_CAFFE_PATH "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_model_mobilenetv1ssd.caffemodel"
+#define OD_OPENCV_CONFIG_CAFFE_PATH "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_model_mobilenetv1ssd.prototxt"
+
 //Face Detection
 #define FD_TFLITE_WEIGHT_PATH "/usr/share/capi-media-vision/models/FD/tflite/fd_tflite_model1.tflite"
 
@@ -1119,6 +1123,89 @@ int perform_tflite_mobilenetv1ssd_config(mv_engine_config_h *engine_cfg)
     return err;
 }
 
+/* Create an engine configuration for the OpenCV backend with the Caffe
+ * MobileNetV1-SSD model (CPU target) and hand it back through engine_cfg.
+ * Returns the mv_create_engine_config() result; setter results are not checked. */
+int perform_opencv_mobilenetv1ssd_config(mv_engine_config_h *engine_cfg)
+{
+    mv_engine_config_h handle = NULL;
+    int err = mv_create_engine_config(&handle);
+    if (err != MEDIA_VISION_ERROR_NONE) {
+        printf("Fail to create engine configuration handle.\n");
+        /* Defensive: release the handle if creation left one behind. */
+        if (handle) {
+            int err2 = mv_destroy_engine_config(handle);
+            if (err2 != MEDIA_VISION_ERROR_NONE)
+                printf("Fail to destroy engine configuration.\n");
+        }
+        return err;
+    }
+
+    const char *inputNodeName = "data";
+    const char *outputNodeName[1] = {"detection_out"};
+
+    mv_engine_config_set_string_attribute(handle,
+                        MV_INFERENCE_MODEL_WEIGHT_FILE_PATH,
+                        OD_OPENCV_WEIGHT_CAFFE_PATH);
+
+    mv_engine_config_set_string_attribute(handle,
+                        MV_INFERENCE_MODEL_CONFIGURATION_FILE_PATH,
+                        OD_OPENCV_CONFIG_CAFFE_PATH);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_INPUT_DATA_TYPE,
+                        MV_INFERENCE_DATA_FLOAT32);
+
+    mv_engine_config_set_string_attribute(handle,
+                        MV_INFERENCE_MODEL_USER_FILE_PATH,
+                        OD_OPENCV_LABEL_CAFFE_PATH);
+
+    mv_engine_config_set_double_attribute(handle,
+                        MV_INFERENCE_MODEL_MEAN_VALUE,
+                        127.5);
+
+    mv_engine_config_set_double_attribute(handle,
+                        MV_INFERENCE_MODEL_STD_VALUE,
+                        127.5);
+
+    mv_engine_config_set_double_attribute(handle,
+                        MV_INFERENCE_CONFIDENCE_THRESHOLD,
+                        0.3);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_BACKEND_TYPE,
+                        MV_INFERENCE_BACKEND_OPENCV);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_TARGET_TYPE,
+                        MV_INFERENCE_TARGET_CPU);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_INPUT_TENSOR_WIDTH,
+                        300);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_INPUT_TENSOR_HEIGHT,
+                        300);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_INPUT_TENSOR_CHANNELS,
+                        3);
+
+    mv_engine_config_set_string_attribute(handle,
+                        MV_INFERENCE_INPUT_NODE_NAME,
+                        inputNodeName);
+
+    mv_engine_config_set_array_string_attribute(handle,
+                        MV_INFERENCE_OUTPUT_NODE_NAMES,
+                        outputNodeName,
+                        1);
+
+    *engine_cfg = handle;
+    return err;
+}
+
+
 int perform_armnn_mobilenetv1ssd_config(mv_engine_config_h *engine_cfg)
 {
     int err = MEDIA_VISION_ERROR_NONE;
@@ -1203,9 +1290,10 @@ int perform_object_detection()
     int err = MEDIA_VISION_ERROR_NONE;
 
     int sel_opt = 0;
-    const int options[5] = {1, 2, 3, 4, 5, 6};
-    const *names[5] = { "Configuration",
+    const int options[7] = {1, 2, 3, 4, 5, 6, 7};
+    const char *names[7] = { "Configuration",
                         "TFLITE(CPU) + MobileNetV1+SSD",
+                        "OPENCV(CPU) + MobileNetV1+SSD",
                         "ARMNN(CPU) + MobileNetV1+SSD",
                         "Prepare",
                         "Run",
@@ -1216,7 +1304,7 @@ int perform_object_detection()
     mv_source_h mvSource = NULL;
 
     while(sel_opt == 0) {
-        sel_opt = show_menu("Select Action:", options, names, 5);
+        sel_opt = show_menu("Select Action:", options, names, 7);
         switch (sel_opt) {
         case 1:
         {
@@ -1244,21 +1332,34 @@ int perform_object_detection()
             err = perform_tflite_mobilenetv1ssd_config(&engine_cfg);
         }
             break;
+
         case 3:
         {
-            //perform ARMNN MobileSSD config
+            //perform OpenCV MobileSSD config
             if (engine_cfg) {
                 int err2 = mv_destroy_engine_config(engine_cfg);
                 if (err2 != MEDIA_VISION_ERROR_NONE)
                     printf("Fail to destroy engine_cfg [err:%i]\n", err2);
             }
 
-            err = perform_armnn_mobilenetv1ssd_config(&engine_cfg);
+            err = perform_opencv_mobilenetv1ssd_config(&engine_cfg);
         }
             break;
 
         case 4:
         {
+            //perform ARMNN MobileSSD config
+            if (engine_cfg) {
+                int err2 = mv_destroy_engine_config(engine_cfg);
+                if (err2 != MEDIA_VISION_ERROR_NONE)
+                    printf("Fail to destroy engine_cfg [err:%i]\n", err2);
+            }
+
+            err = perform_armnn_mobilenetv1ssd_config(&engine_cfg);
+        }
+            break;
+        case 5:
+        {
             // create - configure - prepare
             if (infer) {
                 int err2 = mv_inference_destroy(infer);
@@ -1290,7 +1391,7 @@ int perform_object_detection()
             }
         }
             break;
-        case 5:
+        case 6:
         {
             if (mvSource) {
                 int err2 = mv_destroy_source(mvSource);
@@ -1338,7 +1439,7 @@ int perform_object_detection()
 
         }
             break;
-        case 6:
+        case 7:
         {
             //perform destroy
             if (engine_cfg) {
-- 
2.7.4