From 775d4ec34c0b95a6a58e697857a640f2ab6246aa Mon Sep 17 00:00:00 2001
From: Sangjung Woo
Date: Mon, 26 Oct 2020 11:24:57 +0900
Subject: [PATCH] WIP: Performance improvement

* Apply the ml_single_invoke_no_alloc() ML API instead of ml_single_invoke().
* Remove unnecessary memory copies.

Signed-off-by: Sangjung Woo
---
 src/inference_engine_mlapi.cpp       | 215 ++++++++++++++++++++----------------
 src/inference_engine_mlapi_private.h |   3 +
 2 files changed, 128 insertions(+), 90 deletions(-)

diff --git a/src/inference_engine_mlapi.cpp b/src/inference_engine_mlapi.cpp
index 2e3c0a2..706ec29 100644
--- a/src/inference_engine_mlapi.cpp
+++ b/src/inference_engine_mlapi.cpp
@@ -23,6 +23,12 @@
 #include
 #include
 
+// TODO. Below is test code. DO NOT use ML internal functions.
+#define ENABLE_NO_ALLOC
+#if defined(ENABLE_NO_ALLOC)
+extern "C" int ml_single_invoke_no_alloc(ml_single_h single, const ml_tensors_data_h input, ml_tensors_data_h output);
+#endif
+
 namespace InferenceEngineImpl
 {
 namespace MLAPIImpl
@@ -31,6 +37,8 @@ namespace MLAPIImpl
              mPluginType(),
              mTargetDevice(),
              mSingle(),
+             mInputInfoHandle(),
+             mOutputInfoHandle(),
              mInputDataHandle(),
              mOutputDataHandle(),
              mDesignated_inputs(),
@@ -53,12 +61,20 @@
        ml_single_close(mSingle);
 
+        if (mInputInfoHandle)
+            ml_tensors_info_destroy(mInputInfoHandle);
+
+        if (mOutputInfoHandle)
+            ml_tensors_info_destroy(mOutputInfoHandle);
+
         if (mInputDataHandle)
             ml_tensors_data_destroy(mInputDataHandle);
 
         if (mOutputDataHandle)
             ml_tensors_data_destroy(mOutputDataHandle);
 
+        mInputInfoHandle = NULL;
+        mOutputInfoHandle = NULL;
         mInputDataHandle = NULL;
         mOutputDataHandle = NULL;
     }
@@ -192,16 +208,22 @@
         // TODO. create ml_tensor_info for input and output tensor and pass
         // them as parameters of ml_single_open function.
 
-        int ret = ml_single_open(&mSingle, model_str.c_str(), NULL, NULL,
+        int err = ml_single_open(&mSingle, model_str.c_str(), NULL, NULL,
                                  nnfw_type, nnfw_hw);
-        if (ret != ML_ERROR_NONE) {
-            LOGE("Failed to request ml_single_open(%d).", ret);
+        if (err != ML_ERROR_NONE) {
+            LOGE("Failed to request ml_single_open(%d).", err);
             return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
         }
 
+        err = UpdateTensorsInfo();
+        if (err != INFERENCE_ENGINE_ERROR_NONE) {
+            ml_single_close(mSingle);
+            mSingle = NULL;
+        }
+
         LOGI("LEAVE");
 
-        return INFERENCE_ENGINE_ERROR_NONE;
+        return err;
     }
 
     int InferenceMLAPI::GetInputTensorBuffers(
             std::vector<inference_engine_tensor_buffer> &buffers)
     {
         LOGI("ENTER");
 
-        buffers.clear();
-
         // TODO. Implement this function according to a given ML Single API backend properly.
 
-        ml_tensors_info_h in_info = NULL;
-
-        int ret = ml_single_get_input_info(mSingle, &in_info);
-        if (ret != ML_ERROR_NONE) {
-            LOGE("Failed to request ml_single_get_input_info(%d).", ret);
-            return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
-        }
-
         // ML Single API will always provide internal tensor buffers so
         // get the tensor buffers back to Mediavision framework so that
         // Mediavision framework doesn't allocate the tensor buffers internally.
 
+        buffers.clear();
+
+        int ret;
         unsigned int cnt;
 
-        ret = ml_tensors_info_get_count(in_info, &cnt);
+        ret = ml_tensors_info_get_count(mInputInfoHandle, &cnt);
         if (ret != ML_ERROR_NONE) {
             LOGE("Failed to request ml_tensors_info_get_count(%d).", ret);
             return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
@@ -235,15 +250,19 @@
         LOGI("input tensor count = %u", cnt);
 
-        for (unsigned int i = 0; i < cnt; ++i) {
-            inference_engine_tensor_buffer in_buffer;
-            ml_tensor_type_e in_type;
-
-            ret = ml_tensors_data_create(in_info, &mInputDataHandle);
+        // TODO. Below is test code; should we allocate a new buffer for every inference?
+        if (mInputDataHandle == NULL) {
+            ret = ml_tensors_data_create(mInputInfoHandle, &mInputDataHandle);
             if (ret != ML_ERROR_NONE) {
                 LOGE("Failed to request ml_tensors_data_create(%d).", ret);
                 return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
             }
+        }
+
+        // TODO. Cache tensor info and reduce function calls in UpdateTensorsInfo().
+        for (unsigned int i = 0; i < cnt; ++i) {
+            inference_engine_tensor_buffer in_buffer;
+            ml_tensor_type_e in_type;
 
             ret = ml_tensors_data_get_tensor_data(mInputDataHandle, i, &in_buffer.buffer, &in_buffer.size);
             if (ret != ML_ERROR_NONE) {
@@ -253,10 +272,9 @@
             LOGE("buffer = %p, size = %d\n", in_buffer.buffer, in_buffer.size);
 
-            int ret = ml_tensors_info_get_tensor_type(in_info, i, &in_type);
+            ret = ml_tensors_info_get_tensor_type(mInputInfoHandle, i, &in_type);
             if (ret != ML_ERROR_NONE) {
-                LOGE("Failed to request ml_tensors_info_get_tensor_type(%d).",
-                     ret);
+                LOGE("Failed to request ml_tensors_info_get_tensor_type(%d).", ret);
                 return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
             }
 
@@ -283,25 +301,18 @@
     {
         LOGI("ENTER");
 
-        buffers.clear();
-
         // TODO. Need to check if model file loading is done.
 
-        ml_tensors_info_h out_info = NULL;
-
-        int ret = ml_single_get_output_info(mSingle, &out_info);
-        if (ret != ML_ERROR_NONE) {
-            LOGE("Failed to request ml_single_get_output_info(%d).", ret);
-            return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
-        }
-
         // ML Single API will always provide internal tensor buffers so
         // get the tensor buffers back to Mediavision framework so that
         // Mediavision framework doesn't allocate the tensor buffers internally.
 
+        buffers.clear();
+
+        int ret;
         unsigned int cnt;
 
-        ret = ml_tensors_info_get_count(out_info, &cnt);
+        ret = ml_tensors_info_get_count(mOutputInfoHandle, &cnt);
         if (ret != ML_ERROR_NONE) {
             LOGE("Failed to request ml_tensors_info_get_count(%d).", ret);
             return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
@@ -309,15 +320,19 @@
         LOGI("output tensor count = %u", cnt);
 
-        for (unsigned int i = 0; i < cnt; ++i) {
-            inference_engine_tensor_buffer out_buffer;
-            ml_tensor_type_e out_type;
-
-            ret = ml_tensors_data_create(out_info, &mOutputDataHandle);
+        // TODO. Below is test code; should we allocate a new buffer for every inference?
+        if (mOutputDataHandle == NULL) {
+            ret = ml_tensors_data_create(mOutputInfoHandle, &mOutputDataHandle);
             if (ret != ML_ERROR_NONE) {
                 LOGE("Failed to request ml_tensors_data_create(%d).", ret);
                 return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
             }
+        }
+
+        // TODO. Cache tensor info and reduce function calls in UpdateTensorsInfo().
+        for (unsigned int i = 0; i < cnt; ++i) {
+            inference_engine_tensor_buffer out_buffer;
+            ml_tensor_type_e out_type;
 
             ret = ml_tensors_data_get_tensor_data(mOutputDataHandle, i, &out_buffer.buffer, &out_buffer.size);
             if (ret != ML_ERROR_NONE) {
@@ -327,10 +342,9 @@
             LOGE("buffer = %p, size = %d\n", out_buffer.buffer, out_buffer.size);
 
-            ret = ml_tensors_info_get_tensor_type(out_info, i, &out_type);
+            ret = ml_tensors_info_get_tensor_type(mOutputInfoHandle, i, &out_type);
             if (ret != ML_ERROR_NONE) {
-                LOGE("Failed to request ml_tensors_info_get_tensor_type(%d).",
-                     ret);
+                LOGE("Failed to request ml_tensors_info_get_tensor_type(%d).", ret);
                 return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
             }
 
@@ -357,18 +371,11 @@
     {
         LOGI("ENTER");
 
-        ml_tensors_info_h in_info = NULL;
-
         // TODO. Need to check if model file loading is done.
-
-        int ret = ml_single_get_input_info(mSingle, &in_info);
-        if (ret != ML_ERROR_NONE) {
-            LOGE("Failed to request ml_single_get_input_info(%d).", ret);
-            return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
-        }
-
+        int ret;
         unsigned int cnt;
-        ret = ml_tensors_info_get_count(in_info, &cnt);
+
+        ret = ml_tensors_info_get_count(mInputInfoHandle, &cnt);
         if (ret != ML_ERROR_NONE) {
             LOGE("Failed to request ml_tensors_info_get_count(%d).", ret);
             return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
@@ -379,11 +386,11 @@
         for (unsigned int i = 0; i < cnt; ++i) {
             inference_engine_tensor_info tensor_info;
             ml_tensor_type_e in_type;
-            unsigned int in_dim[ML_TENSOR_RANK_LIMIT];
+            ml_tensor_dimension in_dim;
             char *in_name = NULL;
             size_t in_size = 1;
 
-            ret = ml_tensors_info_get_tensor_type(in_info, i, &in_type);
+            ret = ml_tensors_info_get_tensor_type(mInputInfoHandle, i, &in_type);
             if (ret != ML_ERROR_NONE) {
                 LOGE("Failed to request ml_tensors_info_get_tensor_type(%d).", ret);
@@ -397,7 +404,7 @@
                 return INFERENCE_ENGINE_ERROR_NOT_SUPPORTED;
             }
 
-            ret = ml_tensors_info_get_tensor_dimension(in_info, i, in_dim);
+            ret = ml_tensors_info_get_tensor_dimension(mInputInfoHandle, i, in_dim);
             if (ret != ML_ERROR_NONE) {
                 LOGE("Failed to request ml_tensors_info_get_tensor_dimension(%d).", ret);
@@ -413,7 +420,7 @@
             LOGI("input tensor size = %zu", in_size);
 
-            ret = ml_tensors_info_get_tensor_name(in_info, i, &in_name);
+            ret = ml_tensors_info_get_tensor_name(mInputInfoHandle, i, &in_name);
             if (ret != ML_ERROR_NONE) {
                 LOGE("Failed to request ml_tensors_info_get_tensor_name(%d).", ret);
@@ -442,18 +449,11 @@
     {
         LOGI("ENTER");
 
-        ml_tensors_info_h out_info = NULL;
-
         // TODO. Need to check if model file loading is done.
-
-        int ret = ml_single_get_output_info(mSingle, &out_info);
-        if (ret != ML_ERROR_NONE) {
-            LOGE("Failed to request ml_single_get_output_info(%d).", ret);
-            return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
-        }
-
+        int ret;
         unsigned int cnt;
-        ret = ml_tensors_info_get_count(out_info, &cnt);
+
+        ret = ml_tensors_info_get_count(mOutputInfoHandle, &cnt);
         if (ret != ML_ERROR_NONE) {
             LOGE("Failed to request ml_tensors_info_get_count(%d).", ret);
             return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
@@ -468,7 +468,7 @@
             char *out_name = NULL;
             size_t out_size = 1;
 
-            ret = ml_tensors_info_get_tensor_type(out_info, i, &out_type);
+            ret = ml_tensors_info_get_tensor_type(mOutputInfoHandle, i, &out_type);
             if (ret != ML_ERROR_NONE) {
                 LOGE("Failed to request ml_tensors_info_get_tensor_type(%d).", ret);
@@ -482,7 +482,7 @@
                 return INFERENCE_ENGINE_ERROR_NOT_SUPPORTED;
             }
 
-            ret = ml_tensors_info_get_tensor_dimension(out_info, i, out_dim);
+            ret = ml_tensors_info_get_tensor_dimension(mOutputInfoHandle, i, out_dim);
             if (ret != ML_ERROR_NONE) {
                 LOGE("Failed to request ml_tensors_info_get_tensor_dimension(%d).", ret);
@@ -513,7 +513,7 @@
             LOGI("output tensor size = %zu", out_size);
 
-            ret = ml_tensors_info_get_tensor_name(out_info, i, &out_name);
+            ret = ml_tensors_info_get_tensor_name(mOutputInfoHandle, i, &out_name);
             if (ret != ML_ERROR_NONE) {
                 LOGE("Failed to request ml_tensors_info_get_tensor_name(%d).", ret);
@@ -554,7 +554,7 @@
         // TODO. Request input property information to a given ML Single API of nnstreamer backend,
         // and set it instead of user-given one,
-
+        // Call UpdateTensorsInfo() after requesting input info.
         mDesignated_inputs = property.layer_names;
         mInputProperty = property;
 
@@ -580,7 +580,7 @@
         // TODO. Request output property information to a given ML Single API of nnstreamer backend,
         // and set it instead of user-given one,
-
+        // Call UpdateTensorsInfo() after requesting output info.
         mDesignated_outputs = property.layer_names;
         mOutputProperty = property;
 
@@ -647,6 +647,41 @@
         return -1;
     }
 
+    int InferenceMLAPI::UpdateTensorsInfo()
+    {
+        LOGI("ENTER");
+
+        if (!mSingle) {
+            LOGE("Invalid state, single-shot handle is not initialized.");
+            return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
+        }
+
+        if (mInputInfoHandle) {
+            ml_tensors_info_destroy(mInputInfoHandle);
+            mInputInfoHandle = NULL;
+        }
+
+        if (mOutputInfoHandle) {
+            ml_tensors_info_destroy(mOutputInfoHandle);
+            mOutputInfoHandle = NULL;
+        }
+
+        int ret = ml_single_get_input_info(mSingle, &mInputInfoHandle);
+        if (ret != ML_ERROR_NONE) {
+            LOGE("Failed to request ml_single_get_input_info(%d).", ret);
+            return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
+        }
+
+        ret = ml_single_get_output_info(mSingle, &mOutputInfoHandle);
+        if (ret != ML_ERROR_NONE) {
+            LOGE("Failed to request ml_single_get_output_info(%d).", ret);
+            return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
+        }
+
+        LOGI("LEAVE");
+        return INFERENCE_ENGINE_ERROR_NONE;
+    }
+
     int InferenceMLAPI::Run(
             std::vector<inference_engine_tensor_buffer> &input_buffers,
             std::vector<inference_engine_tensor_buffer> &output_buffers)
@@ -659,46 +694,46 @@
             return err;
         }
 
-        err = ml_single_invoke(mSingle, mInputDataHandle, &mOutputDataHandle);
+#if defined(ENABLE_NO_ALLOC)
+        err = ml_single_invoke_no_alloc(mSingle, mInputDataHandle, mOutputDataHandle);
         if (err != ML_ERROR_NONE) {
-            LOGE("Failed to request ml_single_invoke(%d).", err);
+            LOGE("Failed to request ml_single_invoke_no_alloc(%d).", err);
             return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
         }
+#else
+        ml_tensors_data_h out_data = NULL;
+        void *data_ptr;
+        size_t data_size;
+        unsigned int out_cnt;
 
-        ml_tensors_info_h out_info = NULL;
-
-        err = ml_single_get_output_info(mSingle, &out_info);
+        err = ml_tensors_info_get_count(mOutputInfoHandle, &out_cnt);
         if (err != ML_ERROR_NONE) {
-            LOGE("Failed to request ml_single_get_output_info(%d).", err);
+            LOGE("Failed to request ml_tensors_info_get_count(%d).", err);
             return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
         }
 
-        unsigned int out_cnt;
-
-        err = ml_tensors_info_get_count(out_info, &out_cnt);
+        // Be careful, ml_single_invoke() returns a newly allocated output handle.
+        err = ml_single_invoke(mSingle, mInputDataHandle, &out_data);
         if (err != ML_ERROR_NONE) {
-            LOGE("Failed to request ml_tensors_info_get_count(%d).", err);
+            LOGE("Failed to request ml_single_invoke(%d).", err);
             return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
         }
 
-        // TODO. Why below code is required?
-        // ML Single API provides internal tensor buffer for output tensor
-        // and user alreadys know the buffer by GetOutputTensorBuffers.
-        //
-        // However, without below code, user cannot get the output result
-        // correctly. What happens in ML Single API framework?
         for (unsigned int i = 0; i < out_cnt; ++i) {
-            err = ml_tensors_data_get_tensor_data(
-                    mOutputDataHandle, i, (void **) &output_buffers[i].buffer,
-                    &output_buffers[i].size);
+            err = ml_tensors_data_get_tensor_data(out_data, i, &data_ptr, &data_size);
             if (err != ML_ERROR_NONE) {
                 LOGE("Failed to request ml_tensors_data_get_tensor_data(%d).", err);
+                ml_tensors_data_destroy(out_data);
                 return INFERENCE_ENGINE_ERROR_INVALID_OPERATION;
             }
 
+            // TODO. Remove memcpy() using ml_single_invoke_fill() later.
+            memcpy(output_buffers[i].buffer, data_ptr, output_buffers[i].size);
             LOGI("Output tensor[%u] = %zu", i, output_buffers[i].size);
         }
 
+        ml_tensors_data_destroy(out_data);
+#endif
         LOGI("LEAVE");
 
         return INFERENCE_ENGINE_ERROR_NONE;
diff --git a/src/inference_engine_mlapi_private.h b/src/inference_engine_mlapi_private.h
index b6b4b1e..b34cfc8 100644
--- a/src/inference_engine_mlapi_private.h
+++ b/src/inference_engine_mlapi_private.h
@@ -77,10 +77,13 @@
             std::vector<inference_engine_tensor_buffer> &input_buffers,
             std::vector<inference_engine_tensor_buffer> &output_buffers);
 
         int ConvertTensorType(int tensor_type);
+        int UpdateTensorsInfo();
 
         int mPluginType;
         int mTargetDevice;
         ml_single_h mSingle;
+        ml_tensors_info_h mInputInfoHandle;
+        ml_tensors_info_h mOutputInfoHandle;
         ml_tensors_data_h mInputDataHandle;
         ml_tensors_data_h mOutputDataHandle;
         std::vector<std::string> mDesignated_inputs;
-- 
2.7.4
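A minimal sketch of the two invocation paths this patch switches between with ENABLE_NO_ALLOC, for illustration only. The helper name invoke_once(), its dst/dst_size parameters and the <nnstreamer-single.h> header name are assumptions; ml_single_invoke(), ml_tensors_data_get_tensor_data(), ml_tensors_data_destroy() and the ml_single_invoke_no_alloc() prototype are taken directly from the patch above.

// Sketch, not part of the patch. Assumes the input/output ml_tensors_data_h
// handles were created once from the cached ml_tensors_info_h handles
// (see UpdateTensorsInfo() and ml_tensors_data_create() above).
#include <string.h>              // memcpy()
#include <nnstreamer-single.h>   // ML Single API (assumed header name)

// Internal symbol declared the same way as in the patch; not a public API.
extern "C" int ml_single_invoke_no_alloc(ml_single_h single,
                                         const ml_tensors_data_h input,
                                         ml_tensors_data_h output);

static int invoke_once(ml_single_h single,
                       ml_tensors_data_h in_data,   // pre-filled input tensors
                       ml_tensors_data_h out_data,  // pre-allocated output tensors
                       void *dst, size_t dst_size)  // caller-visible output buffer
{
#if defined(ENABLE_NO_ALLOC)
    // Fast path: results land directly in the pre-allocated out_data handle,
    // so there is no per-inference allocation and no memcpy(); the caller
    // already sees out_data's internal buffer via GetOutputTensorBuffers().
    (void) dst;
    (void) dst_size;
    return ml_single_invoke_no_alloc(single, in_data, out_data);
#else
    // Fallback path: ml_single_invoke() allocates a fresh output handle on
    // every call, so the result has to be copied out and the handle destroyed.
    ml_tensors_data_h new_out = NULL;
    int err = ml_single_invoke(single, in_data, &new_out);
    if (err != ML_ERROR_NONE)
        return err;

    void *ptr = NULL;
    size_t size = 0;
    err = ml_tensors_data_get_tensor_data(new_out, 0, &ptr, &size);
    if (err == ML_ERROR_NONE && size <= dst_size)
        memcpy(dst, ptr, size);

    ml_tensors_data_destroy(new_out);
    return err;
#endif
}

The difference between the two branches is the whole point of the patch: the fallback path pays one allocation, one copy and one destroy per inference, while the no-alloc path reuses the handles created in GetInputTensorBuffers()/GetOutputTensorBuffers().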