From 2cf6383d9a9e3e3e3e3875fc47fe1616ccffa731 Mon Sep 17 00:00:00 2001 From: Tae-Young Chung Date: Thu, 2 Apr 2020 14:10:56 +0900 Subject: [PATCH] mv_inference: add post-process step to GetObjectDetectionResults() To handle model data (SSD) which doesn't have a post-process layer, add a post-process step. SSD generally provides a single output with a 1x1xNx7 shape, and the last dimension includes 'image id, class id, confidence, coordinates (left-top (x,y) and right-bottom (x,y))' Signed-off-by: Tae-Young Chung --- mv_inference/inference/src/Inference.cpp | 110 ++++++++++++++++++++-- test/testsuites/inference/inference_test_suite.c | 115 +++++++++++++++++++++-- 2 files changed, 210 insertions(+), 15 deletions(-) diff --git a/mv_inference/inference/src/Inference.cpp b/mv_inference/inference/src/Inference.cpp index 057da62..3585300 100755 --- a/mv_inference/inference/src/Inference.cpp +++ b/mv_inference/inference/src/Inference.cpp @@ -1013,12 +1013,61 @@ int Inference::GetObjectDetectionResults(ObjectDetectionResults *detectionResult return ret; } + // In case of object detection, + // a model may apply post-process but others may not. + // Thus, those cases should be handled separately. std::vector> inferDimInfo(outputData.dimInfo); + LOGI("inferDimInfo size: %d", outputData.dimInfo.size()); + std::vector inferResults(outputData.data.begin(), outputData.data.end()); - float* boxes = reinterpret_cast(inferResults[0]); - float* classes = reinterpret_cast(inferResults[1]); - float* scores = reinterpret_cast(inferResults[2]); - int number_of_detections = (int)(*reinterpret_cast(inferResults[3])); + LOGI("inferResults size: %d", inferResults.size()); + + float* boxes = nullptr; + float* classes = nullptr; + float* scores = nullptr; + int number_of_detections = 0; + + cv::Mat cvScores, cvClasses, cvBoxes; + if (outputData.dimInfo.size() == 1) { + // there is no way to know how many objects are detected unless the number of objects is + // provided. 
In that case, each backend should provide the number of results manually. + // For example, in OpenCV, MobilenetV1-SSD doesn't provide it, so the number of objects is + // written to the 1st element, i.e., outputData.data[0] (the shape is 1x1xNx7 and the 1st of 7 + // indicates the image id, which is useless if batch mode isn't supported). + // So, use the 1st of 7. + + number_of_detections = (int)(*reinterpret_cast(outputData.data[0])); + cv::Mat cvOutputData(number_of_detections, inferDimInfo[0][3], CV_32F, outputData.data[0]); + + // boxes + cv::Mat cvLeft = cvOutputData.col(3).clone(); + cv::Mat cvTop = cvOutputData.col(4).clone(); + cv::Mat cvRight = cvOutputData.col(5).clone(); + cv::Mat cvBottom = cvOutputData.col(6).clone(); + + cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight }; + cv::hconcat(cvBoxElems, 4, cvBoxes); + + LOGI("cvBoxes size: %d x %d", cvBoxes.size[0], cvBoxes.size[1]); + LOGI("0: %f, %f, %f, %f", cvBoxes.at(0,0), cvBoxes.at(0,1), cvBoxes.at(0,2),cvBoxes.at(0,3)); + + // classes + cvClasses = cvOutputData.col(1).clone(); + + // scores + cvScores = cvOutputData.col(2).clone(); + + boxes = cvBoxes.ptr(0); + classes = cvClasses.ptr(0); + scores = cvScores.ptr(0); + + } else { + boxes = reinterpret_cast(inferResults[0]); + classes = reinterpret_cast(inferResults[1]); + scores = reinterpret_cast(inferResults[2]); + number_of_detections = (int)(*reinterpret_cast(inferResults[3])); + } + LOGI("number_of_detections = %d", number_of_detections); @@ -1068,14 +1117,59 @@ int Inference::GetFaceDetectionResults(FaceDetectionResults *detectionResults) return ret; } + // In case of object detection, + // a model may apply post-process but others may not. + // Thus, those cases should be handled separately. 
std::vector> inferDimInfo(outputData.dimInfo); + LOGI("inferDimInfo size: %d", outputData.dimInfo.size()); + std::vector inferResults(outputData.data.begin(), outputData.data.end()); + LOGI("inferResults size: %d", inferResults.size()); + + float* boxes = nullptr; + float* classes = nullptr; + float* scores = nullptr; + int number_of_detections = 0; + + cv::Mat cvScores, cvClasses, cvBoxes; + if (outputData.dimInfo.size() == 1) { + + // There is no way to know how many objects are detected unless the number of objects is + // provided. In that case, each backend should provide the number of results manually. + // For example, in OpenCV, MobilenetV1-SSD doesn't provide it, so the number of objects is + // written to the 1st element, i.e., outputData.data[0] (the shape is 1x1xNx7 and the 1st of 7 + // indicates the image id, which is useless if batch mode isn't supported). + // So, use the 1st of 7. + + number_of_detections = (int)(*reinterpret_cast(outputData.data[0])); + cv::Mat cvOutputData(number_of_detections, inferDimInfo[0][3], CV_32F, outputData.data[0]); + + // boxes: columns 3..6 of each row are left, top, right, bottom + cv::Mat cvLeft = cvOutputData.col(3).clone(); + cv::Mat cvTop = cvOutputData.col(4).clone(); + cv::Mat cvRight = cvOutputData.col(5).clone(); + cv::Mat cvBottom = cvOutputData.col(6).clone(); - float* boxes = reinterpret_cast(inferResults[0]); - float* classes = reinterpret_cast(inferResults[1]); - float* scores = reinterpret_cast(inferResults[2]); + cv::Mat cvBoxElems[] = { cvTop, cvLeft, cvBottom, cvRight }; + cv::hconcat(cvBoxElems, 4, cvBoxes); + + // classes + cvClasses = cvOutputData.col(1).clone(); + + // scores + cvScores = cvOutputData.col(2).clone(); + + boxes = cvBoxes.ptr(0); + classes = cvClasses.ptr(0); + scores = cvScores.ptr(0); + + } else { + boxes = reinterpret_cast(inferResults[0]); + classes = reinterpret_cast(inferResults[1]); + scores = reinterpret_cast(inferResults[2]); + number_of_detections = (int)(*reinterpret_cast(inferResults[3])); + } - int number_of_detections = 
(int)(*reinterpret_cast(inferResults[3])); int left, top, right, bottom; cv::Rect loc; diff --git a/test/testsuites/inference/inference_test_suite.c b/test/testsuites/inference/inference_test_suite.c index 570b2b7..d984e38 100644 --- a/test/testsuites/inference/inference_test_suite.c +++ b/test/testsuites/inference/inference_test_suite.c @@ -45,6 +45,10 @@ #define OD_LABEL_PATH "/usr/share/capi-media-vision/models/OD/tflite/od_label.txt" #define OD_TFLITE_WEIGHT_PATH "/usr/share/capi-media-vision/models/OD/tflite/od_tflite_model.tflite" +#define OD_OPENCV_LABEL_CAFFE_PATH "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_label_mobilenetv1ssd.txt" +#define OD_OPENCV_WEIGHT_CAFFE_PATH "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_model_mobilenetv1ssd.caffemodel" +#define OD_OPENCV_CONFIG_CAFFE_PATH "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_model_mobilenetv1ssd.prototxt" + //Face Detection #define FD_TFLITE_WEIGHT_PATH "/usr/share/capi-media-vision/models/FD/tflite/fd_tflite_model1.tflite" @@ -1119,6 +1123,89 @@ int perform_tflite_mobilenetv1ssd_config(mv_engine_config_h *engine_cfg) return err; } +int perform_opencv_mobilenetv1ssd_config(mv_engine_config_h *engine_cfg) +{ + int err = MEDIA_VISION_ERROR_NONE; + + mv_engine_config_h handle = NULL; + err = mv_create_engine_config(&handle); + if (err != MEDIA_VISION_ERROR_NONE) { + printf("Fail to create engine configuration handle.\n"); + if (handle) { + int err2 = mv_destroy_engine_config(handle); + if (err2 != MEDIA_VISION_ERROR_NONE) { + printf("Fail to destroy engine cofniguration.\n"); + } + } + return err; + } + + + char *inputNodeName = "data"; + char *outputNodeName[1] = {"detection_out"}; + + mv_engine_config_set_string_attribute(handle, + MV_INFERENCE_MODEL_WEIGHT_FILE_PATH, + OD_OPENCV_WEIGHT_CAFFE_PATH); + + mv_engine_config_set_string_attribute(handle, + MV_INFERENCE_MODEL_CONFIGURATION_FILE_PATH, + OD_OPENCV_CONFIG_CAFFE_PATH); + + mv_engine_config_set_int_attribute(handle, + 
MV_INFERENCE_INPUT_DATA_TYPE, + MV_INFERENCE_DATA_FLOAT32); + + mv_engine_config_set_string_attribute(handle, + MV_INFERENCE_MODEL_USER_FILE_PATH, + OD_OPENCV_LABEL_CAFFE_PATH); + + mv_engine_config_set_double_attribute(handle, + MV_INFERENCE_MODEL_MEAN_VALUE, + 127.5); + + mv_engine_config_set_double_attribute(handle, + MV_INFERENCE_MODEL_STD_VALUE, + 127.5); + + mv_engine_config_set_double_attribute(handle, + MV_INFERENCE_CONFIDENCE_THRESHOLD, + 0.3); + + mv_engine_config_set_int_attribute(handle, + MV_INFERENCE_BACKEND_TYPE, + MV_INFERENCE_BACKEND_OPENCV); + + mv_engine_config_set_int_attribute(handle, + MV_INFERENCE_TARGET_TYPE, + MV_INFERENCE_TARGET_CPU); + + mv_engine_config_set_int_attribute(handle, + MV_INFERENCE_INPUT_TENSOR_WIDTH, + 300); + + mv_engine_config_set_int_attribute(handle, + MV_INFERENCE_INPUT_TENSOR_HEIGHT, + 300); + + mv_engine_config_set_int_attribute(handle, + MV_INFERENCE_INPUT_TENSOR_CHANNELS, + 3); + + mv_engine_config_set_string_attribute(handle, + MV_INFERENCE_INPUT_NODE_NAME, + inputNodeName); + + mv_engine_config_set_array_string_attribute(handle, + MV_INFERENCE_OUTPUT_NODE_NAMES, + outputNodeName, + 1); + + *engine_cfg = handle; + return err; +} + + int perform_armnn_mobilenetv1ssd_config(mv_engine_config_h *engine_cfg) { int err = MEDIA_VISION_ERROR_NONE; @@ -1203,9 +1290,10 @@ int perform_object_detection() int err = MEDIA_VISION_ERROR_NONE; int sel_opt = 0; - const int options[5] = {1, 2, 3, 4, 5, 6}; - const *names[5] = { "Configuration", + const int options[7] = {1, 2, 3, 4, 5, 6, 7}; + const char *names[7] = { "Configuration", "TFLITE(CPU) + MobileNetV1+SSD", + "OPENCV(CPU) + MobileNetV1+SSD", "ARMNN(CPU) + MobileNetV1+SSD", "Prepare", "Run", @@ -1216,7 +1304,7 @@ int perform_object_detection() mv_source_h mvSource = NULL; while(sel_opt == 0) { - sel_opt = show_menu("Select Action:", options, names, 5); + sel_opt = show_menu("Select Action:", options, names, 7); switch (sel_opt) { case 1: { @@ -1244,21 +1332,34 @@ int 
perform_object_detection() err = perform_tflite_mobilenetv1ssd_config(&engine_cfg); } break; + case 3: { - //perform ARMNN MobileSSD config + //perform OpenCV MobileSSD config if (engine_cfg) { int err2 = mv_destroy_engine_config(engine_cfg); if (err2 != MEDIA_VISION_ERROR_NONE) printf("Fail to destroy engine_cfg [err:%i]\n", err2); } - err = perform_armnn_mobilenetv1ssd_config(&engine_cfg); + err = perform_opencv_mobilenetv1ssd_config(&engine_cfg); } break; case 4: { + //perform ARMNN MobileSSD config + if (engine_cfg) { + int err2 = mv_destroy_engine_config(engine_cfg); + if (err2 != MEDIA_VISION_ERROR_NONE) + printf("Fail to destroy engine_cfg [err:%i]\n", err2); + } + + err = perform_armnn_mobilenetv1ssd_config(&engine_cfg); + } + break; + case 5: + { // create - configure - prepare if (infer) { int err2 = mv_inference_destroy(infer); @@ -1290,7 +1391,7 @@ int perform_object_detection() } } break; - case 5: + case 6: { if (mvSource) { int err2 = mv_destroy_source(mvSource); @@ -1338,7 +1439,7 @@ int perform_object_detection() } break; - case 6: + case 7: { //perform destroy if (engine_cfg) { -- 2.7.4