From: Tae-Young Chung <ty83.chung@samsung.com>
Date: Wed, 8 Apr 2020 04:09:26 +0000 (+0900)
Subject: test: Add OPENCV backend test cases
X-Git-Tag: submit/tizen/20200423.063253~15
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d88a87d197b42429accd58d00613e0eba9a923d8;p=platform%2Fcore%2Fmultimedia%2Finference-engine-interface.git

test: Add OPENCV backend test cases

Change-Id: Ie39cad370751d89adba608ca3c9e781369c52bba
Signed-off-by: Tae-Young Chung <ty83.chung@samsung.com>
---

diff --git a/test/res/face_detection_caffe.bin b/test/res/face_detection_caffe.bin
new file mode 100644
index 0000000..cca305a
Binary files /dev/null and b/test/res/face_detection_caffe.bin differ
diff --git a/test/res/faciallandmark_detection_caffe.bin b/test/res/faciallandmark_detection_caffe.bin
new file mode 100644
index 0000000..d777b57
Binary files /dev/null and b/test/res/faciallandmark_detection_caffe.bin differ
diff --git a/test/res/image_classification_caffe.bin b/test/res/image_classification_caffe.bin
new file mode 100644
index 0000000..3bab81b
Binary files /dev/null and b/test/res/image_classification_caffe.bin differ
diff --git a/test/res/object_detection_caffe.bin b/test/res/object_detection_caffe.bin
new file mode 100644
index 0000000..7749b5a
Binary files /dev/null and b/test/res/object_detection_caffe.bin differ
diff --git a/test/src/inference_engine_test.cpp b/test/src/inference_engine_test.cpp
index 48b194c..82c386f 100644
--- a/test/src/inference_engine_test.cpp
+++ b/test/src/inference_engine_test.cpp
@@ -36,6 +36,7 @@ typedef std::tuple<std::string, int, int, int, int, std::vector<std::string>, in
 class InferenceEngineCommonTest : public testing::TestWithParam<ParamType> { };
 class InferenceEngineCommonTest_2 : public testing::TestWithParam<ParamType_Load> { };
 class InferenceEngineTfliteTest : public testing::TestWithParam<ParamType_Infer> { };
+class InferenceEngineCaffeTest : public testing::TestWithParam<ParamType_Infer> { }; // fixture for the Caffe-model inference cases (ParamType_Infer tuples)
 
 std::map<std::string, int> Model_Formats = {
 	{ "caffemodel", INFERENCE_MODEL_CAFFE },
@@ -51,7 +52,7 @@ enum {
 	TEST_IMAGE_CLASSIFICATION = 0,
 	TEST_OBJECT_DETECTION,
 	TEST_FACE_DETECTION,
-	TEST_FACILA_LANDMARK_DETECTION,
+	TEST_FACIAL_LANDMARK_DETECTION,
 	TEST_POSE_ESTIMATION
 };
 
@@ -356,10 +357,42 @@ int VerifyObjectDetectionResults(tensor_t &outputData, std::vector<int> &answers
 {
 	std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
 	std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
-	float *boxes = reinterpret_cast<float *>(inferResults[0]);
-	float *scores = reinterpret_cast<float *>(inferResults[2]);
 
-	int num_of_detections = (int)(*reinterpret_cast<float *>(inferResults[3]));
+	float* boxes = nullptr;
+	float* classes = nullptr;
+	float* scores = nullptr;
+	int num_of_detections = 0;
+
+	if (outputData.dimInfo.size() == 1) {
+		// There is no way to know how many objects were detected unless the backend
+		// provides that number, so each backend has to supply it manually.
+		// For example, in OpenCV, MobilenetV1-SSD doesn't provide it, so the number of objects is
+		// written to the 1st element, i.e., outputData.data[0] (the shape is 1x1xNx7 and the 1st of 7
+		// indicates the image id, which is useless if a batch mode isn't supported).
+		// So, use the 1st of 7.
+
+		num_of_detections = (int)(*reinterpret_cast<float*>(outputData.data[0]));
+
+		boxes = new float[num_of_detections * 4];
+		classes = new float[num_of_detections];
+		scores = new float[num_of_detections];
+
+		for (int idx = 0; idx < num_of_detections; ++idx) {
+			classes[idx] = (reinterpret_cast<float*>(outputData.data[0]))[idx*inferDimInfo[0][3] + 1];
+			scores[idx] = (reinterpret_cast<float*>(outputData.data[0]))[idx*inferDimInfo[0][3] + 2];
+
+			boxes[idx*4] = (reinterpret_cast<float*>(outputData.data[0]))[idx*inferDimInfo[0][3] + 4];
+			boxes[idx*4  + 1] = (reinterpret_cast<float*>(outputData.data[0]))[idx*inferDimInfo[0][3] + 3];
+			boxes[idx*4  + 2] = (reinterpret_cast<float*>(outputData.data[0]))[idx*inferDimInfo[0][3] + 6];
+			boxes[idx*4  + 3] = (reinterpret_cast<float*>(outputData.data[0]))[idx*inferDimInfo[0][3] + 5];
+		}
+	} else {
+		boxes = reinterpret_cast<float*>(inferResults[0]);
+		classes = reinterpret_cast<float*>(inferResults[1]);
+		scores = reinterpret_cast<float*>(inferResults[2]);
+		num_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
+	}
+
 	int left = 0, top = 0, right = 0, bottom = 0;
 	float max_score = 0.0f;
 
@@ -374,9 +407,40 @@ int VerifyObjectDetectionResults(tensor_t &outputData, std::vector<int> &answers
 		}
 	}
 
+	if (outputData.dimInfo.size() == 1) {
+		delete [] boxes;
+		delete [] classes;
+		delete [] scores;
+	}
+
 	return (answers[0] == left && answers[1] == top && answers[2] == right && answers[3] == bottom);
 }
 
+int VerifyFacialLandmarkDetectionResults(tensor_t &outputData, std::vector<int> &answers, int height, int width)
+{
+	// Compares predicted landmarks against `answers`, laid out as x0, y0, x1, y1, ...
+	// The first output tensor is read as interleaved (x, y) floats whose element count
+	// is dimInfo[0][1]; x is scaled by `width` and y by `height` before comparing.
+	// Returns 1 when every landmark matches, 0 on a mismatch or malformed input.
+	if (outputData.dimInfo.empty() || outputData.dimInfo[0].size() < 2 || outputData.data.empty())
+		return 0;
+
+	const float* loc = reinterpret_cast<const float*>(outputData.data[0]);
+	if (loc == nullptr)
+		return 0;
+
+	// Only complete (x, y) pairs are checked so loc[] is never read past the end.
+	const size_t pairs = outputData.dimInfo[0][1] > 0 ? (size_t)outputData.dimInfo[0][1] / 2 : 0;
+	if (answers.size() < pairs * 2)
+		return 0;
+
+	for (size_t i = 0; i < pairs; ++i) {
+		if ((int)(loc[i*2] * width) != answers[i*2] || (int)(loc[i*2 + 1] * height) != answers[i*2 + 1])
+			return 0;
+	}
+	return 1;
+}
+
 int VerifyPoseEstimationResults(tensor_t &outputData, std::vector<int> &answers, int height, int width)
 {
 	std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
@@ -454,8 +518,8 @@ TEST_P(InferenceEngineTfliteTest, Inference)
 	case TEST_FACE_DETECTION:
 		test_name.append("Face detection");
 		break;
-	case TEST_FACILA_LANDMARK_DETECTION:
-		test_name.append("Facila landmark detection");
+	case TEST_FACIAL_LANDMARK_DETECTION:
+		test_name.append("Facial landmark detection");
 		break;
 	case TEST_POSE_ESTIMATION:
 		test_name.append("Pose estimation");
@@ -589,7 +653,7 @@ TEST_P(InferenceEngineTfliteTest, Inference)
 		ret = VerifyObjectDetectionResults(result, answers, 1152, 1536);
 		EXPECT_EQ(ret, 1);
 		break;
-	case TEST_FACILA_LANDMARK_DETECTION:
+	case TEST_FACIAL_LANDMARK_DETECTION:
 		// TODO.
 		break;
 	case TEST_POSE_ESTIMATION:
@@ -607,6 +671,194 @@ TEST_P(InferenceEngineTfliteTest, Inference)
 	delete engine;
 }
 
+TEST_P(InferenceEngineCaffeTest, Inference)
+{ // Runs one parameterized inference case end to end and checks the output against `answers`.
+	std::string backend_name;
+	int target_devices;
+	int test_type;
+	int iteration;
+	int tensor_type;
+	std::vector<std::string> image_paths;
+	int height;
+	int width;
+	int ch;
+	std::vector<std::string> input_layers;
+	std::vector<std::string> output_layers;
+	std::vector<std::string> model_paths;
+	std::vector<int> answers;
+	// Unpack the parameter tuple; the field order matches the declarations above.
+	std::tie(backend_name, target_devices, test_type, iteration, tensor_type, image_paths, height, width, ch, input_layers, output_layers, model_paths, answers) = GetParam();
+	// Inference is always run at least once.
+	if (iteration < 1) {
+		iteration = 1;
+	}
+
+	std::string test_name;
+	switch (test_type) {
+	case TEST_IMAGE_CLASSIFICATION:
+		test_name.append("Image classification");
+		break;
+	case TEST_OBJECT_DETECTION:
+		test_name.append("Object detection");
+		break;
+	case TEST_FACE_DETECTION:
+		test_name.append("Face detection");
+		break;
+	case TEST_FACIAL_LANDMARK_DETECTION:
+		test_name.append("Facial landmark detection");
+		break;
+	case TEST_POSE_ESTIMATION:
+		test_name.append("Pose estimation");
+		break;
+	}
+
+	std::cout << test_name << " inference test : backend = " << backend_name << ", target device = " << (target_devices == INFERENCE_TARGET_CPU ? "CPU" : "GPU")  << "\n";
+
+	inference_engine_config config = {
+		.backend_name = backend_name,
+		.target_devices = target_devices
+	};
+
+	InferenceEngineCommon *engine = new InferenceEngineCommon(&config);
+	if (engine == nullptr) { // NOTE(review): plain new throws on failure, so this branch is likely unreachable - confirm
+		ASSERT_TRUE(engine);
+		return;
+	}
+
+	int ret = engine->EnableProfiler(true);
+	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+		delete engine; // release first: a failing ASSERT_* returns from the test body immediately
+		ASSERT_EQ(ret, INFERENCE_ENGINE_ERROR_NONE);
+		return;
+	}
+
+	ret = engine->DumpProfileToFile("dump.txt");
+	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+		delete engine;
+		ASSERT_EQ(ret, INFERENCE_ENGINE_ERROR_NONE);
+		return;
+	}
+
+	ret = engine->BindBackend(&config);
+	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+		delete engine;
+		ASSERT_EQ(ret, INFERENCE_ENGINE_ERROR_NONE);
+		return;
+	}
+
+	inference_engine_capacity capacity;
+	ret = engine->GetBackendCapacity(&capacity);
+	EXPECT_EQ(ret, INFERENCE_ENGINE_ERROR_NONE);
+
+	ret = engine->SetTargetDevices(target_devices);
+	EXPECT_EQ(ret, INFERENCE_ENGINE_ERROR_NONE);
+
+	std::vector <std::string> models;
+	int model_type = GetModelInfo(model_paths, models);
+	if (model_type == -1) {
+		delete engine;
+		ASSERT_NE(model_type, -1);
+		return;
+	}
+	// Describe every input layer as a 1 x ch x height x width NCHW tensor of tensor_type.
+	inference_engine_layer_property input_property;
+	std::vector<std::string>::iterator iter;
+
+	for (iter = input_layers.begin(); iter != input_layers.end(); iter++) {
+		inference_engine_tensor_info tensor_info = {
+			{ 1, ch, height, width },
+			(inference_tensor_shape_type_e)TENSOR_SHAPE_NCHW,
+			(inference_tensor_data_type_e)tensor_type,
+			(size_t)(1 * ch * height * width)
+		};
+
+		input_property.layer_names.push_back(*iter);
+		input_property.tensor_infos.push_back(tensor_info);
+    }
+
+	ret = engine->SetInputLayerProperty(input_property);
+	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+		delete engine;
+		ASSERT_EQ(ret, INFERENCE_ENGINE_ERROR_NONE);
+		return;
+	}
+	// Output layers are registered by name only; no tensor info is supplied for them here.
+	inference_engine_layer_property output_property;
+
+	for (iter = output_layers.begin(); iter != output_layers.end(); iter++) {
+		output_property.layer_names.push_back(*iter);
+	}
+
+	ret = engine->SetOutputLayerProperty(output_property);
+	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+		delete engine;
+		ASSERT_EQ(ret, INFERENCE_ENGINE_ERROR_NONE);
+		return;
+	}
+
+	ret = engine->Load(models, (inference_model_format_e)model_type);
+	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+		delete engine;
+		ASSERT_EQ(ret, INFERENCE_ENGINE_ERROR_NONE);
+		return;
+	}
+
+	std::vector<inference_engine_tensor_buffer> inputs, outputs;
+	ret = PrepareTensorBuffers(engine, inputs, outputs);
+	if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+		delete engine;
+		ASSERT_EQ(ret, INFERENCE_ENGINE_ERROR_NONE);
+		return;
+	}
+
+	// Copy input image tensor data from a given file to input tensor buffer.
+	for (int i = 0; i < (int)image_paths.size(); ++i) {
+		CopyFileToMemory(image_paths[i].c_str(), inputs[i], inputs[i].size); // NOTE(review): assumes inputs has at least image_paths.size() entries - confirm
+	}
+	// Run inference `iteration` times; the outputs are verified once, after the last run.
+	for (int repeat = 0; repeat < iteration; ++repeat) {
+		ret = engine->Run(inputs, outputs);
+		EXPECT_EQ(ret, INFERENCE_ENGINE_ERROR_NONE);
+	}
+
+	tensor_t result;
+	FillOutputResult(engine, outputs, result);
+	// Check the result against `answers` with the verifier that matches the test type.
+	switch (test_type) {
+	case TEST_IMAGE_CLASSIFICATION:
+		ret = VerifyImageClassificationResults(result, answers[0]);
+		EXPECT_EQ(ret, 1);
+		break;
+	case TEST_OBJECT_DETECTION:
+		// 636 : fixed height size of dumped image, 1024 : fixed width size of dumped image.
+		ret = VerifyObjectDetectionResults(result, answers, 636, 1024);
+		EXPECT_EQ(ret, 1);
+		break;
+	case TEST_FACE_DETECTION:
+		// 1152 : fixed height size of dumped image, 1536 : fixed width size of dumped image.
+		ret = VerifyObjectDetectionResults(result, answers, 1152, 1536);
+		EXPECT_EQ(ret, 1);
+		break;
+	case TEST_FACIAL_LANDMARK_DETECTION:
+		// 128 : fixed height size of dumped image, 128 : fixed width size of dumped image.
+		ret = VerifyFacialLandmarkDetectionResults(result, answers, 128, 128);
+		EXPECT_EQ(ret, 1);
+		break;
+	case TEST_POSE_ESTIMATION:
+		// 563 : fixed height size of dumped image, 750 : fixed width size of dumped image.
+		ret = VerifyPoseEstimationResults(result, answers, 563, 750);
+		EXPECT_EQ(ret, 1);
+		break;
+	}
+
+	CleanupTensorBuffers(inputs, outputs);
+
+	engine->UnbindBackend();
+	models.clear();
+
+	delete engine;
+}
+
 INSTANTIATE_TEST_CASE_P(Prefix, InferenceEngineCommonTest,
 		testing::Values(
 			// parameter order : backend name, target device
@@ -618,6 +870,11 @@ INSTANTIATE_TEST_CASE_P(Prefix, InferenceEngineCommonTest,
 			ParamType("tflite", INFERENCE_TARGET_GPU),
 			// DLDT.
-			ParamType("dldt", INFERENCE_TARGET_CUSTOM)
+			ParamType("dldt", INFERENCE_TARGET_CUSTOM),
+			// OPENCV.
+			ParamType("opencv", INFERENCE_TARGET_CPU),
+			ParamType("opencv", INFERENCE_TARGET_GPU)
+			/* TODO */
+
 		)
 );
 
@@ -630,7 +887,10 @@ INSTANTIATE_TEST_CASE_P(Prefix, InferenceEngineCommonTest_2,
 			ParamType_Load("armnn", INFERENCE_TARGET_GPU, { "/usr/share/capi-media-vision/models/IC/tflite/ic_tflite_model.tflite" }),
 			// TFLITE.
 			ParamType_Load("tflite", INFERENCE_TARGET_CPU, { "/usr/share/capi-media-vision/models/IC/tflite/ic_tflite_model.tflite" }),
-			ParamType_Load("tflite", INFERENCE_TARGET_GPU, { "/usr/share/capi-media-vision/models/IC/tflite/ic_tflite_model.tflite" })
+			ParamType_Load("tflite", INFERENCE_TARGET_GPU, { "/usr/share/capi-media-vision/models/IC/tflite/ic_tflite_model.tflite" }),
+			// OPENCV.
+			ParamType_Load("opencv", INFERENCE_TARGET_CPU, { "/usr/share/capi-media-vision/models/IC/caffe/ic_caffe_model_squeezenet.caffemodel", "/usr/share/capi-media-vision/models/IC/caffe/ic_caffe_model_squeezenet.prototxt" }),
+			ParamType_Load("opencv", INFERENCE_TARGET_GPU, { "/usr/share/capi-media-vision/models/IC/caffe/ic_caffe_model_squeezenet.caffemodel", "/usr/share/capi-media-vision/models/IC/caffe/ic_caffe_model_squeezenet.prototxt" })
 			/* TODO */
 		)
 );
@@ -682,3 +942,28 @@ INSTANTIATE_TEST_CASE_P(Prefix, InferenceEngineTfliteTest,
 			/* TODO */
 		)
 );
+
+INSTANTIATE_TEST_CASE_P(Prefix, InferenceEngineCaffeTest,
+		testing::Values(
+			// parameter order : backend_name, target_devices, test_type, iteration, tensor_type, image_paths, height, width, ch, input_layers, output_layers, model_paths, answers
+			// OPENCV. NOTE(review): every case below is listed twice with INFERENCE_TARGET_CPU; the second entry may have been meant for INFERENCE_TARGET_GPU - confirm.
+			// squeezenet based image classification test
+			ParamType_Infer("opencv", INFERENCE_TARGET_CPU, TEST_IMAGE_CLASSIFICATION, 10, TENSOR_DATA_TYPE_FLOAT32, { "/opt/usr/images/image_classification_caffe.bin" }, 227, 227, 3, { "data" }, { "prob" }, { "/usr/share/capi-media-vision/models/IC/caffe/ic_caffe_model_squeezenet.caffemodel", "/usr/share/capi-media-vision/models/IC/caffe/ic_caffe_model_squeezenet.prototxt" }, { 281 }),
+			ParamType_Infer("opencv", INFERENCE_TARGET_CPU, TEST_IMAGE_CLASSIFICATION, 10, TENSOR_DATA_TYPE_FLOAT32, { "/opt/usr/images/image_classification_caffe.bin" }, 227, 227, 3, { "data" }, { "prob" }, { "/usr/share/capi-media-vision/models/IC/caffe/ic_caffe_model_squeezenet.caffemodel", "/usr/share/capi-media-vision/models/IC/caffe/ic_caffe_model_squeezenet.prototxt" }, { 281 }),
+
+			// mobilenet-ssd based object detection test
+			ParamType_Infer("opencv", INFERENCE_TARGET_CPU, TEST_OBJECT_DETECTION, 10, TENSOR_DATA_TYPE_FLOAT32, { "/opt/usr/images/object_detection_caffe.bin" }, 300, 300, 3, { "data" }, { "detection_out" }, { "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_model_mobilenetv1ssd.caffemodel", "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_model_mobilenetv1ssd.prototxt" }, { 15, 19, 335, 557 }),
+			ParamType_Infer("opencv", INFERENCE_TARGET_CPU, TEST_OBJECT_DETECTION, 10, TENSOR_DATA_TYPE_FLOAT32, { "/opt/usr/images/object_detection_caffe.bin" }, 300, 300, 3, { "data" }, { "detection_out" }, { "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_model_mobilenetv1ssd.caffemodel", "/usr/share/capi-media-vision/models/OD/caffe/od_caffe_model_mobilenetv1ssd.prototxt" }, { 15, 19, 335, 557 }),
+
+			// resnet10-ssd based face detection test
+			ParamType_Infer("opencv", INFERENCE_TARGET_CPU, TEST_FACE_DETECTION, 10, TENSOR_DATA_TYPE_FLOAT32, { "/opt/usr/images/face_detection_caffe.bin" }, 300, 300, 3, { "data" }, { "detection_out" }, { "/usr/share/capi-media-vision/models/FD/caffe/fd_caffe_model_resnet10ssd.caffemodel", "/usr/share/capi-media-vision/models/FD/caffe/fd_caffe_model_resnet10ssd.prototxt" }, { 733, 233, 965, 539 }),
+			ParamType_Infer("opencv", INFERENCE_TARGET_CPU, TEST_FACE_DETECTION, 10, TENSOR_DATA_TYPE_FLOAT32, { "/opt/usr/images/face_detection_caffe.bin" }, 300, 300, 3, { "data" }, { "detection_out" }, { "/usr/share/capi-media-vision/models/FD/caffe/fd_caffe_model_resnet10ssd.caffemodel", "/usr/share/capi-media-vision/models/FD/caffe/fd_caffe_model_resnet10ssd.prototxt" }, { 733, 233, 965, 539 }),
+
+			// tweakcnn based facial landmark detection test
+			ParamType_Infer("opencv", INFERENCE_TARGET_CPU, TEST_FACIAL_LANDMARK_DETECTION, 10, TENSOR_DATA_TYPE_FLOAT32, { "/opt/usr/images/faciallandmark_detection_caffe.bin" }, 128, 128, 3, { "data" }, { "Sigmoid_fc2" }, { "/usr/share/capi-media-vision/models/FLD/caffe/fld_caffe_model_tweak.caffemodel", "/usr/share/capi-media-vision/models/FLD/caffe/fld_caffe_model_tweak.prototxt" },
+							{ 53, 45, 85, 46, 66, 64, 54, 78, 82, 79}),
+			ParamType_Infer("opencv", INFERENCE_TARGET_CPU, TEST_FACIAL_LANDMARK_DETECTION, 10, TENSOR_DATA_TYPE_FLOAT32, { "/opt/usr/images/faciallandmark_detection_caffe.bin" }, 128, 128, 3, { "data" }, { "Sigmoid_fc2" }, { "/usr/share/capi-media-vision/models/FLD/caffe/fld_caffe_model_tweak.caffemodel", "/usr/share/capi-media-vision/models/FLD/caffe/fld_caffe_model_tweak.prototxt" },
+							{ 53, 45, 85, 46, 66, 64, 54, 78, 82, 79})
+			/* TODO */
+		)
+);
\ No newline at end of file