mv_inference: add post-process step to GetObjectDetectionResults()
authorTae-Young Chung <ty83.chung@samsung.com>
Thu, 2 Apr 2020 05:10:56 +0000 (14:10 +0900)
committerInki Dae <inki.dae@samsung.com>
Tue, 14 Apr 2020 00:42:19 +0000 (09:42 +0900)
To handle SSD model data which doesn't have a post-process layer,
add a post-process step.
SSD generally provides a single output with a 1x1xNx7 shape, and the last dimension
includes 'image id, class id, confidence, coordinates (left-top (x,y)
and right-bottom (x,y))'.

Signed-off-by: Tae-Young Chung <ty83.chung@samsung.com>
mv_inference/inference/src/Inference.cpp
test/testsuites/inference/inference_test_suite.c

index 057da62..3585300 100755 (executable)
@@ -1013,12 +1013,61 @@ int Inference::GetObjectDetectionResults(ObjectDetectionResults *detectionResult
                return ret;
        }
 
+       // In case of object detection,
+       // a model may apply post-process but others may not.
+       // Thus, those cases should be handled separately.
        std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+       LOGI("inferDimInfo size: %d", outputData.dimInfo.size());
+
        std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
-       float* boxes = reinterpret_cast<float*>(inferResults[0]);
-       float* classes = reinterpret_cast<float*>(inferResults[1]);
-       float* scores = reinterpret_cast<float*>(inferResults[2]);
-       int number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
+       LOGI("inferResults size: %d", inferResults.size());
+
+       float* boxes = nullptr;
+       float* classes = nullptr;
+       float* scores = nullptr;
+       int number_of_detections = 0;
+
+       cv::Mat cvScores, cvClasses, cvBoxes;
+       if (outputData.dimInfo.size() == 1) {
+               // There is no way to know how many objects are detected unless the number of objects is
+               // provided. In that case, each backend should provide the number of results manually.
+               // For example, in OpenCV, MobilenetV1-SSD doesn't provide it, so the number of objects is
+               // written to the 1st element, i.e., outputData.data[0] (the shape is 1x1xNx7 and the 1st of 7
+               // indicates the image id, but it is useless if batch mode isn't supported).
+               // So, use the 1st of 7.
+
+               number_of_detections = (int)(*reinterpret_cast<float*>(outputData.data[0]));
+               cv::Mat cvOutputData(number_of_detections, inferDimInfo[0][3], CV_32F, outputData.data[0]);
+
+               // boxes
+               cv::Mat cvLeft = cvOutputData.col(3).clone();
+               cv::Mat cvTop = cvOutputData.col(4).clone();
+               cv::Mat cvRight = cvOutputData.col(5).clone();
+               cv::Mat cvBottom = cvOutputData.col(6).clone();
+
+               cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
+               cv::hconcat(cvBoxElems, 4, cvBoxes);
+
+               LOGI("cvBoxes size: %d x %d", cvBoxes.size[0], cvBoxes.size[1]);
+               LOGI("0: %f, %f, %f, %f", cvBoxes.at<float>(0,0), cvBoxes.at<float>(0,1), cvBoxes.at<float>(0,2),cvBoxes.at<float>(0,3));
+
+               // classes
+               cvClasses = cvOutputData.col(1).clone();
+
+               // scores
+               cvScores = cvOutputData.col(2).clone();
+
+               boxes = cvBoxes.ptr<float>(0);
+               classes = cvClasses.ptr<float>(0);
+               scores = cvScores.ptr<float>(0);
+
+       } else {
+               boxes = reinterpret_cast<float*>(inferResults[0]);
+               classes = reinterpret_cast<float*>(inferResults[1]);
+               scores = reinterpret_cast<float*>(inferResults[2]);
+               number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
+       }
+
 
        LOGI("number_of_detections = %d", number_of_detections);
 
@@ -1068,14 +1117,59 @@ int Inference::GetFaceDetectionResults(FaceDetectionResults *detectionResults)
                return ret;
        }
 
+       // In case of object detection,
+       // a model may apply post-process but others may not.
+       // Thus, those cases should be handled separately.
        std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+       LOGI("inferDimInfo size: %d", outputData.dimInfo.size());
+
        std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
+       LOGI("inferResults size: %d", inferResults.size());
+
+       float* boxes = nullptr;
+       float* classes = nullptr;
+       float* scores = nullptr;
+       int number_of_detections = 0;
+
+       cv::Mat cvScores, cvClasses, cvBoxes;
+       if (outputData.dimInfo.size() == 1) {
+
+               // There is no way to know how many objects are detected unless the number of objects is
+               // provided. In that case, each backend should provide the number of results manually.
+               // For example, in OpenCV, MobilenetV1-SSD doesn't provide it, so the number of objects is
+               // written to the 1st element, i.e., outputData.data[0] (the shape is 1x1xNx7 and the 1st of 7
+               // indicates the image id, but it is useless if batch mode isn't supported).
+               // So, use the 1st of 7.
+
+               number_of_detections = (int)(*reinterpret_cast<float*>(outputData.data[0]));
+               cv::Mat cvOutputData(number_of_detections, inferDimInfo[0][3], CV_32F, outputData.data[0]);
+
+               // boxes
+               cv::Mat cvTop = cvOutputData.col(6).clone();
+               cv::Mat cvLeft = cvOutputData.col(3).clone();
+               cv::Mat cvBottom = cvOutputData.col(4).clone();
+               cv::Mat cvRight = cvOutputData.col(5).clone();
 
-       float* boxes = reinterpret_cast<float*>(inferResults[0]);
-       float* classes = reinterpret_cast<float*>(inferResults[1]);
-       float* scores = reinterpret_cast<float*>(inferResults[2]);
+               cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight };
+               cv::hconcat(cvBoxElems, 4, cvBoxes);
+
+               // classes
+               cvClasses = cvOutputData.col(1).clone();
+
+               // scores
+               cvScores = cvOutputData.col(2).clone();
+
+               boxes = cvBoxes.ptr<float>(0);
+               classes = cvClasses.ptr<float>(0);
+               scores = cvScores.ptr<float>(0);
+
+       } else {
+               boxes = reinterpret_cast<float*>(inferResults[0]);
+               classes = reinterpret_cast<float*>(inferResults[1]);
+               scores = reinterpret_cast<float*>(inferResults[2]);
+               number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
+       }
 
-       int number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
        int left, top, right, bottom;
        cv::Rect loc;
 
index 570b2b7..d984e38 100644 (file)
 #define OD_LABEL_PATH "/usr/share/capi-media-vision/models/OD/tflite/od_label.txt"
 #define OD_TFLITE_WEIGHT_PATH "/usr/share/capi-media-vision/models/OD/tflite/od_tflite_model.tflite"
 
+#define OD_OPENCV_LABEL_CAFFE_PATH "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_label_mobilenetv1ssd.txt"
+#define OD_OPENCV_WEIGHT_CAFFE_PATH "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_model_mobilenetv1ssd.caffemodel"
+#define OD_OPENCV_CONFIG_CAFFE_PATH "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_model_mobilenetv1ssd.prototxt"
+
 //Face Detection
 #define FD_TFLITE_WEIGHT_PATH "/usr/share/capi-media-vision/models/FD/tflite/fd_tflite_model1.tflite"
 
@@ -1119,6 +1123,89 @@ int perform_tflite_mobilenetv1ssd_config(mv_engine_config_h *engine_cfg)
     return err;
 }
 
+// Build an engine configuration for the MobileNetV1-SSD Caffe model files
+// (OD_OPENCV_*_CAFFE_PATH) using the OpenCV backend and the CPU target.
+//
+// engine_cfg: out-parameter; receives the newly created handle once all
+//             attributes have been set. It is left untouched when handle
+//             creation fails.
+// Returns the result of mv_create_engine_config(). The return values of the
+// attribute setters below are not checked, so a setter failure is not
+// reflected in the returned code.
+int perform_opencv_mobilenetv1ssd_config(mv_engine_config_h *engine_cfg)
+{
+    int err = MEDIA_VISION_ERROR_NONE;
+
+    mv_engine_config_h handle = NULL;
+    err = mv_create_engine_config(&handle);
+    if (err != MEDIA_VISION_ERROR_NONE) {
+        printf("Fail to create engine configuration handle.\n");
+        // Release the handle if one was produced despite the failure.
+        if (handle) {
+            int err2 = mv_destroy_engine_config(handle);
+            if (err2 != MEDIA_VISION_ERROR_NONE) {
+                printf("Fail to destroy engine cofniguration.\n");
+            }
+        }
+        return err;
+    }
+
+
+    // Node names passed to the input/output node attributes at the end.
+    char *inputNodeName = "data";
+    char *outputNodeName[1] = {"detection_out"};
+
+    // Model files: weights, network description and label list.
+    mv_engine_config_set_string_attribute(handle,
+                        MV_INFERENCE_MODEL_WEIGHT_FILE_PATH,
+                        OD_OPENCV_WEIGHT_CAFFE_PATH);
+
+    mv_engine_config_set_string_attribute(handle,
+                        MV_INFERENCE_MODEL_CONFIGURATION_FILE_PATH,
+                        OD_OPENCV_CONFIG_CAFFE_PATH);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_INPUT_DATA_TYPE,
+                        MV_INFERENCE_DATA_FLOAT32);
+
+    mv_engine_config_set_string_attribute(handle,
+                        MV_INFERENCE_MODEL_USER_FILE_PATH,
+                        OD_OPENCV_LABEL_CAFFE_PATH);
+
+    // NOTE(review): mean/std of 127.5 presumably normalize 8-bit pixel
+    // values to roughly [-1, 1] - confirm against the backend.
+    mv_engine_config_set_double_attribute(handle,
+                        MV_INFERENCE_MODEL_MEAN_VALUE,
+                        127.5);
+
+    mv_engine_config_set_double_attribute(handle,
+                        MV_INFERENCE_MODEL_STD_VALUE,
+                        127.5);
+
+    mv_engine_config_set_double_attribute(handle,
+                        MV_INFERENCE_CONFIDENCE_THRESHOLD,
+                        0.3);
+
+    // Run on the OpenCV backend, CPU target.
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_BACKEND_TYPE,
+                        MV_INFERENCE_BACKEND_OPENCV);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_TARGET_TYPE,
+                        MV_INFERENCE_TARGET_CPU);
+
+    // Input tensor: 300 (width) x 300 (height) x 3 (channels).
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_INPUT_TENSOR_WIDTH,
+                        300);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_INPUT_TENSOR_HEIGHT,
+                        300);
+
+    mv_engine_config_set_int_attribute(handle,
+                        MV_INFERENCE_INPUT_TENSOR_CHANNELS,
+                        3);
+
+    mv_engine_config_set_string_attribute(handle,
+                        MV_INFERENCE_INPUT_NODE_NAME,
+                        inputNodeName);
+
+    // Single output node, hence an array of size 1.
+    mv_engine_config_set_array_string_attribute(handle,
+                        MV_INFERENCE_OUTPUT_NODE_NAMES,
+                        outputNodeName,
+                        1);
+
+    // Hand the configured handle to the caller.
+    *engine_cfg = handle;
+    return err;
+}
+
+
 int perform_armnn_mobilenetv1ssd_config(mv_engine_config_h *engine_cfg)
 {
     int err = MEDIA_VISION_ERROR_NONE;
@@ -1203,9 +1290,10 @@ int perform_object_detection()
     int err = MEDIA_VISION_ERROR_NONE;
 
     int sel_opt = 0;
-    const int options[5] = {1, 2, 3, 4, 5, 6};
-    const *names[5] = { "Configuration",
+    const int options[7] = {1, 2, 3, 4, 5, 6, 7};
+    const char *names[7] = { "Configuration",
                         "TFLITE(CPU) + MobileNetV1+SSD",
+                        "OPENCV(CPU) + MobileNetV1+SSD",
                         "ARMNN(CPU) + MobileNetV1+SSD",
                         "Prepare",
                         "Run",
@@ -1216,7 +1304,7 @@ int perform_object_detection()
     mv_source_h mvSource = NULL;
 
     while(sel_opt == 0) {
-        sel_opt = show_menu("Select Action:", options, names, 5);
+        sel_opt = show_menu("Select Action:", options, names, 7);
         switch (sel_opt) {
         case 1:
         {
@@ -1244,21 +1332,34 @@ int perform_object_detection()
             err = perform_tflite_mobilenetv1ssd_config(&engine_cfg);
         }
             break;
+
         case 3:
         {
-            //perform ARMNN MobileSSD config
+            //perform OpenCV MobileSSD config
             if (engine_cfg) {
                 int err2 = mv_destroy_engine_config(engine_cfg);
                 if (err2 != MEDIA_VISION_ERROR_NONE)
                     printf("Fail to destroy engine_cfg [err:%i]\n", err2);
             }
 
-            err = perform_armnn_mobilenetv1ssd_config(&engine_cfg);
+            err = perform_opencv_mobilenetv1ssd_config(&engine_cfg);
         }
             break;
 
         case 4:
         {
+            //perform ARMNN MobileSSD config
+            if (engine_cfg) {
+                int err2 = mv_destroy_engine_config(engine_cfg);
+                if (err2 != MEDIA_VISION_ERROR_NONE)
+                    printf("Fail to destroy engine_cfg [err:%i]\n", err2);
+            }
+
+            err = perform_armnn_mobilenetv1ssd_config(&engine_cfg);
+        }
+            break;
+        case 5:
+        {
             // create - configure - prepare
             if (infer) {
                 int err2 = mv_inference_destroy(infer);
@@ -1290,7 +1391,7 @@ int perform_object_detection()
             }
         }
             break;
-        case 5:
+        case 6:
         {
             if (mvSource) {
                 int err2 = mv_destroy_source(mvSource);
@@ -1338,7 +1439,7 @@ int perform_object_detection()
 
         }
             break;
-        case 6:
+        case 7:
         {
             //perform destroy
             if (engine_cfg) {