From: Inki Dae <inki.dae@samsung.com>
Date: Wed, 12 Feb 2020 02:35:21 +0000 (+0900)
Subject: mv_inference: Move OpenCV dependent code from inference-engine-vision
X-Git-Tag: submit/tizen/20200423.063253~38
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=494a50e8f2ddb187b58f27c8c38e4c9db8c1675b;p=platform%2Fcore%2Fapi%2Fmediavision.git

mv_inference: Move OpenCV dependent code from inference-engine-vision

This patch moves OpenCV dependent code from the inference-engine-vision
layer to the Inference layer. Now we can remove all inference-engine-vision
relevant files - inference_engine_vision_impl.cpp and
inference_engine_vision_impl.h.
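After this change a caller drives the whole pipeline through the Inference
class alone. As a rough sketch based on the methods touched by this patch
(values are illustrative and error handling is omitted), the expected call
sequence for classification, given an mv_source_h mvSource, is:

    mediavision::inference::Inference infer;
    infer.ConfigureModelFiles(configPath, weightPath, labelPath);
    // width, height, dim, ch, std deviation, mean - illustrative values
    infer.ConfigureTensorInfo(224, 224, 1, 3, 127.5, 127.5);
    infer.Prepare();                 // caches tensor info, mean/deviation, threshold
    infer.Load();                    // loads labels and model, maps the input buffer
    infer.Run(mvSource, NULL);       // DoPreprocess() now runs here, then mBackend->Run()
    ImageClassificationResults results;
    infer.GetClassficationResults(&results);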
Change-Id: I7750a4b8b85aba8a2d220fb4ee88f7007d6d5939
Signed-off-by: Inki Dae <inki.dae@samsung.com>
---

diff --git a/mv_inference/inference/include/Inference.h b/mv_inference/inference/include/Inference.h
index 223ad155..4c6a4f6e 100755
--- a/mv_inference/inference/include/Inference.h
+++ b/mv_inference/inference/include/Inference.h
@@ -24,6 +24,8 @@
 #include "inference_engine_error.h"
 #include "inference_engine_vision_impl.h"
 #include
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
 
 /**
  * @file Inference.h
@@ -31,6 +33,33 @@
  * provides inference interface.
  */
 using namespace InferenceEngineInterface::Vision;
+
+typedef struct _ImageClassficationResults {
+    int number_of_classes;
+    std::vector<int> indices;
+    std::vector<std::string> names;
+    std::vector<float> confidences;
+} ImageClassificationResults; /**< structure ImageClassificationResults */
+
+typedef struct _ObjectDetectionResults {
+    int number_of_objects;
+    std::vector<int> indices;
+    std::vector<std::string> names;
+    std::vector<float> confidences;
+    std::vector<cv::Rect> locations;
+} ObjectDetectionResults; /**< structure ObjectDetectionResults */
+
+typedef struct _FaceDetectionResults {
+    int number_of_faces;
+    std::vector<float> confidences;
+    std::vector<cv::Rect> locations;
+} FaceDetectionResults; /**< structure FaceDetectionResults */
+
+typedef struct _FacialLandMarkDetectionResults {
+    int number_of_landmarks;
+    std::vector<cv::Point> locations;
+} FacialLandMarkDetectionResults; /**< structure FacialLandMarkDetectionResults */
+
 namespace mediavision {
 namespace inference {
@@ -262,20 +291,33 @@ public:
 
 private:
     bool mCanRun; /**< The flag indicating ready to run Inference */
+    InferenceConfig mConfig;
+    inference_engine_capacity mBackendCapacity;
+    std::map<int, std::pair<std::string, bool>> mSupportedInferenceBackend;
+    cv::Size mInputSize;
+    int mCh;
+    int mDim;
+    double mDeviation;
+    double mMean;
+    double mThreshold;
+    int mOutputNumbers;
+    cv::Size mSourceSize;
+    cv::Mat mInputBuffer;
+    int mMatType;
 
-    InferenceConfig mConfig;
     mv_engine_config_h engine_config;
 
-    inference_engine_capacity mBackendCapacity;
     InferenceEngineVision * mBackend;
 
-    std::map<int, std::pair<std::string, bool>> mSupportedInferenceBackend;
     std::map<std::string, int> mModelFormats;
+    std::vector<std::string> mUserListName;
 
 private:
     void CheckSupportedInferenceBackend();
     int ConvertEngineErrorToVisionError(int error);
     int ConvertTargetTypes(int given_types);
+    int DoPreprocess(cv::Mat cvImg);
+    int SetUserFile(std::string filename);
 };
 
 } /* Inference */
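The four result typedefs above are plain parallel arrays: entry i of
indices/names/confidences/locations all describe detection i. A minimal,
hypothetical consumer of ObjectDetectionResults (assuming Inference.h is
included) would look like:

    #include <cstdio>

    void PrintDetections(const ObjectDetectionResults &r)
    {
        for (int i = 0; i < r.number_of_objects; ++i)
            printf("%s (%d): %.2f at [%d, %d, %dx%d]\n",
                   r.names[i].c_str(), r.indices[i], r.confidences[i],
                   r.locations[i].x, r.locations[i].y,
                   r.locations[i].width, r.locations[i].height);
    }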
diff --git a/mv_inference/inference/src/Inference.cpp b/mv_inference/inference/src/Inference.cpp
index ee72a6d6..4ef3f66f 100755
--- a/mv_inference/inference/src/Inference.cpp
+++ b/mv_inference/inference/src/Inference.cpp
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 #include
 
 #define MV_INFERENCE_OUTPUT_NUMBERS_MAX 10
@@ -30,6 +31,16 @@
 #define MV_INFERENCE_CONFIDENCE_THRESHOLD_MAX 1.0
 #define MV_INFERENCE_CONFIDENCE_THRESHOLD_MIN 0.0
 
+typedef enum {
+    InputAttrNoType = 0,
+    InputAttrFloat32 = 1,
+    InputAttrInt32 = 2,
+    InputAttrUInt8 = 3,
+    InputAttrInt64 = 4,
+    InputAttrString = 5,
+    InputAttrBool = 6,
+} InputAttrType;
+
 namespace mediavision {
 namespace inference {
 InferenceConfig::InferenceConfig() :
@@ -53,7 +64,17 @@ Inference::Inference() :
     mCanRun(),
     mConfig(),
     mBackendCapacity(),
-    mSupportedInferenceBackend()
+    mSupportedInferenceBackend(),
+    mInputSize(cv::Size()),
+    mCh(0),
+    mDim(0),
+    mDeviation(0.0),
+    mMean(0.0),
+    mThreshold(0.0),
+    mOutputNumbers(0),
+    mSourceSize(cv::Size()),
+    mInputBuffer(cv::Mat()),
+    mMatType(0)
 {
     LOGI("ENTER");
 
@@ -178,6 +199,68 @@ int Inference::ConvertTargetTypes(int given_types)
     return target_types;
 }
 
+int Inference::DoPreprocess(cv::Mat cvImg)
+{
+    mSourceSize = cvImg.size();
+    int width = mInputSize.width;
+    int height = mInputSize.height;
+
+    // Convert the color space if the model expects a single channel.
+    cv::Mat sample;
+    if (cvImg.channels() == 3 && mCh == 1)
+        cv::cvtColor(cvImg, sample, cv::COLOR_BGR2GRAY);
+    else
+        sample = cvImg;
+
+    // Resize the source image to the input tensor size.
+    cv::Mat sampleResized;
+    if (sample.size() != cv::Size(width, height))
+        cv::resize(sample, sampleResized, cv::Size(width, height));
+    else
+        sampleResized = sample;
+
+    // Convert the pixel type to float for the normalization below.
+    cv::Mat sampleFloat;
+    if (mCh == 3)
+        sampleResized.convertTo(sampleFloat, CV_32FC3);
+    else
+        sampleResized.convertTo(sampleFloat, CV_32FC1);
+
+    // Normalize: subtract the mean and divide by the deviation.
+    cv::Mat sampleNormalized;
+    cv::Mat meanMat;
+    if (mCh == 3)
+        meanMat = cv::Mat(sampleFloat.size(), CV_32FC3, cv::Scalar((float)mMean, (float)mMean, (float)mMean));
+    else
+        meanMat = cv::Mat(sampleFloat.size(), CV_32FC1, cv::Scalar((float)mMean));
+
+    cv::subtract(sampleFloat, meanMat, sampleNormalized);
+
+    sampleNormalized /= (float)mDeviation;
+
+    sampleNormalized.convertTo(mInputBuffer, mMatType);
+
+    return MEDIA_VISION_ERROR_NONE;
+}
+
+int Inference::SetUserFile(std::string filename)
+{
+    std::ifstream fp(filename.c_str());
+    if (!fp.is_open()) {
+        return MEDIA_VISION_ERROR_INVALID_PATH;
+    }
+
+    std::string userListName;
+    while (!fp.eof()) {
+        std::getline(fp, userListName);
+        if (userListName.length())
+            mUserListName.push_back(userListName);
+    }
+
+    fp.close();
+
+    return MEDIA_VISION_ERROR_NONE;
+}
+
 void Inference::ConfigureModelFiles(const std::string modelConfigFilePath,
                                     const std::string modelWeightFilePath,
                                     const std::string modelUserFilePath)
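DoPreprocess() above boils down to out = (pixel - mean) / deviation, applied
after an optional BGR-to-gray conversion and a resize to the model's input
size, with the result written into mInputBuffer. A standalone OpenCV
equivalent for the 3-channel case (a sketch; Preprocess() is not part of this
patch) is:

    #include <opencv2/core.hpp>
    #include <opencv2/imgproc.hpp>

    cv::Mat Preprocess(const cv::Mat &src, cv::Size inputSize,
                       double mean, double deviation)
    {
        cv::Mat resized, asFloat;
        cv::resize(src, resized, inputSize);      // scale to the input tensor size
        resized.convertTo(asFloat, CV_32FC3);     // float for the arithmetic below
        asFloat -= cv::Scalar(mean, mean, mean);  // subtract the per-channel mean
        asFloat /= deviation;                     // divide by the deviation
        return asFloat;
    }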
@@ -194,7 +277,7 @@ void Inference::ConfigureTensorInfo(int width,
                                     double stdValue,
                                     double meanValue)
 {
-  mConfig.mTensorInfo = {width, height, dim, ch};
+    mConfig.mTensorInfo = {width, height, dim, ch};
     mConfig.mStdValue = stdValue;
     mConfig.mMeanValue = meanValue;
 }
@@ -298,20 +381,22 @@ int Inference::Prepare(void)
 {
     LOGI("ENTER");
 
-    // Input Tensor Param
-    mBackend->SetInputTensorParamInput(mConfig.mTensorInfo.width,
-                                       mConfig.mTensorInfo.height,
-                                       mConfig.mTensorInfo.dim,
-                                       mConfig.mTensorInfo.ch);
+    mCh = mConfig.mTensorInfo.ch;
+    mDim = mConfig.mTensorInfo.dim;
+    mInputSize = cv::Size(mConfig.mTensorInfo.width, mConfig.mTensorInfo.height);
+    LOGI("InputSize is %d x %d\n", mInputSize.width, mInputSize.height);
 
-    mBackend->SetInputTensorParamNorm(mConfig.mStdValue, mConfig.mMeanValue);
+    mDeviation = mConfig.mStdValue;
+    mMean = mConfig.mMeanValue;
+    LOGI("mean %.4f, deviation %.4f", mMean, mDeviation);
 
     mBackend->SetInputTensorParamNode(mConfig.mInputNodeName);
 
-    // Output Tensor Param
-    mBackend->SetOutputTensorParamNumbers(mConfig.mMaxOutputNumbers);
+    mOutputNumbers = mConfig.mMaxOutputNumbers;
+    LOGI("outputNumber %d", mOutputNumbers);
 
-    mBackend->SetOutputTensorParamThresHold(mConfig.mConfidenceThresHold);
+    mThreshold = mConfig.mConfidenceThresHold;
+    LOGI("threshold %.4f", mThreshold);
 
     mBackend->SetOutputTensorParamNodes(mConfig.mOutputNodeNames);
@@ -348,6 +433,19 @@ int Inference::Load(void)
 {
     LOGI("ENTER");
 
+    std::string label_file = mConfig.mUserFilePath;
+    size_t userFileLength = label_file.length();
+    if (userFileLength > 0 && access(label_file.c_str(), F_OK)) {
+        LOGE("Label file does not exist at [%s]", label_file.c_str());
+        return MEDIA_VISION_ERROR_INVALID_PARAMETER;
+    }
+
+    int ret = (userFileLength > 0) ? SetUserFile(label_file) : MEDIA_VISION_ERROR_NONE;
+    if (ret != MEDIA_VISION_ERROR_NONE) {
+        LOGE("Fail to load label file.");
+        return ret;
+    }
+
     // Check if model file is valid or not.
     std::string ext_str = mConfig.mWeightFilePath.substr(mConfig.mWeightFilePath.find_last_of(".") + 1);
     std::map<std::string, int>::iterator key = mModelFormats.find(ext_str);
@@ -378,19 +476,73 @@ int Inference::Load(void)
         break;
     }
 
-    models.push_back(mConfig.mUserFilePath);
-
     // Request model loading to backend engine.
-    int ret = mBackend->Load(models, (inference_model_format_e)key->second);
+    ret = mBackend->Load(models, (inference_model_format_e)key->second);
     if (ret != INFERENCE_ENGINE_ERROR_NONE) {
         delete mBackend;
         LOGE("Fail to load model");
         mCanRun = false;
-        goto out;
+        std::vector<std::string>().swap(models);
+        return ConvertEngineErrorToVisionError(ret);
     }
 
+    // Get the input tensor type and set up mMatType / mInputBuffer accordingly.
+    InputAttrType attrType = static_cast<InputAttrType>(mBackend->GetInputLayerAttrType());
+    if (attrType == InputAttrUInt8) {
+        LOGI("InputType is %d ch with UINT8", mCh);
+        if (mCh == 1) {
+            mMatType = CV_8UC1;
+        } else if (mCh == 3) {
+            mMatType = CV_8UC3;
+        } else {
+            LOGE("Not supported");
+            std::vector<std::string>().swap(models);
+            return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
+        }
+    } else if (attrType == InputAttrFloat32) {
+        LOGI("InputType is %d ch with FLOAT32", mCh);
+        if (mCh == 1) {
+            mMatType = CV_32FC1;
+        } else if (mCh == 3) {
+            mMatType = CV_32FC3;
+        } else {
+            LOGE("Not supported");
+            std::vector<std::string>().swap(models);
+            return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
+        }
+    } else {
+        LOGE("Not supported");
+        std::vector<std::string>().swap(models);
+        return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
+    }
+
+    tensor_t inputData;
+    std::vector<int> info{1, mMatType, mInputSize.height, mInputSize.width};
+    inputData.dimInfo.push_back(info);
+
+    // Some plug-ins (OpenCV) do not allocate input memory while loading a model,
+    // while others (TFLite) do. Thus SetInputDataBuffer() is implemented in
+    // plug-ins such as OpenCV but is left empty in plug-ins such as TFLite.
+    ret = mBackend->SetInputDataBuffer(inputData);
+    if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+        LOGE("Fail to SetInputData");
+        return ConvertEngineErrorToVisionError(ret);
+    }
+
+    void *dataPtr = mBackend->GetInputDataPtr();
+    if (dataPtr == nullptr) {
+        LOGE("Input data address is null");
+        std::vector<std::string>().swap(models);
+        return MEDIA_VISION_ERROR_INTERNAL;
+    }
+
+    mInputBuffer = cv::Mat(mInputSize.height, mInputSize.width, mMatType, dataPtr);
+
     mCanRun = true;
 
-out:
+    std::vector<std::string>().swap(models);
 
     LOGI("LEAVE");
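A side note on the new input path: Load() wraps the backend's input tensor
memory in a cv::Mat (mInputBuffer), so the convertTo() at the end of
DoPreprocess() writes pixels straight into the tensor with no extra copy.
The cv::Mat constructor taking an external pointer only creates a header
around existing memory. A minimal sketch of that idea (the vector stands in
for the backend's buffer; it is not the real GetInputDataPtr() memory):

    #include <opencv2/core.hpp>
    #include <vector>

    int main()
    {
        std::vector<float> backing(224 * 224 * 3);        // stand-in for GetInputDataPtr()
        cv::Mat view(224, 224, CV_32FC3, backing.data()); // header only - no allocation, no copy
        view.setTo(cv::Scalar(0.5f, 0.5f, 0.5f));         // writes land directly in 'backing'
        return 0;
    }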
@@ -438,7 +590,15 @@ int Inference::Run(mv_source_h mvSource, mv_rectangle_s *roi)
     }
 
     LOGE("Size: w:%d, h:%d", cvSource.size().width, cvSource.size().height);
-    ret = mBackend->Run(cvSource);
+
+    // Convert the color space of the input tensor data and then normalize it.
+    ret = DoPreprocess(cvSource);
+    if (ret != MEDIA_VISION_ERROR_NONE) {
+        LOGE("Fail to preprocess input tensor data.");
+        return ret;
+    }
+
+    ret = mBackend->Run();
 
     return ConvertEngineErrorToVisionError(ret);
 }
@@ -450,13 +610,63 @@ std::pair<std::string, bool> Inference::GetSupportedInferenceBackend(int backend
 
 int Inference::GetClassficationResults(ImageClassificationResults *classificationResults)
 {
-    ImageClassificationResults results;
-    int ret = mBackend->GetInferenceResult(results);
+    tensor_t outputData;
+    int ret = mBackend->GetInferenceResult(outputData);
     if (ret != INFERENCE_ENGINE_ERROR_NONE) {
         LOGE("Fail to GetClassificationResults");
         return ConvertEngineErrorToVisionError(ret);
     }
 
+    // Will contain the top N results in ascending order.
+    std::vector<std::pair<float, int>> top_results;
+    std::priority_queue<std::pair<float, int>,
+                        std::vector<std::pair<float, int>>,
+                        std::greater<std::pair<float, int>>> top_result_pq;
+    float value;
+
+    std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+    std::vector<void *> inferResults(outputData.data.begin(), outputData.data.end());
+
+    long count = inferDimInfo[0][1];
+    LOGI("count: %ld", count);
+
+    float *prediction = reinterpret_cast<float *>(inferResults[0]);
+    for (int i = 0; i < count; ++i) {
+        value = prediction[i];
+
+        // Push every score; threshold filtering is applied after the top-N
+        // candidates have been selected.
+        top_result_pq.push(std::pair<float, int>(value, i));
+
+        // If at capacity, kick the smallest value out.
+        if (top_result_pq.size() > mOutputNumbers) {
+            top_result_pq.pop();
+        }
+    }
+
+    // Copy to the output vector and reverse into descending order.
+    while (!top_result_pq.empty()) {
+        top_results.push_back(top_result_pq.top());
+        top_result_pq.pop();
+    }
+    std::reverse(top_results.begin(), top_results.end());
+
+    int classIdx = -1;
+    ImageClassificationResults results;
+    results.number_of_classes = 0;
+    for (int idx = 0; idx < top_results.size(); ++idx) {
+        if (top_results[idx].first < mThreshold)
+            continue;
+        LOGI("idx:%d", idx);
+        LOGI("classIdx: %d", top_results[idx].second);
+        LOGI("classProb: %f", top_results[idx].first);
+
+        classIdx = top_results[idx].second;
+        results.indices.push_back(classIdx);
+        results.confidences.push_back(top_results[idx].first);
+        results.names.push_back(mUserListName[classIdx]);
+        results.number_of_classes++;
+    }
+
     *classificationResults = results;
     LOGE("Inference: GetClassificationResults: %d\n", results.number_of_classes);
     return MEDIA_VISION_ERROR_NONE;
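The classification path keeps only the best N scores with a min-heap, so
selection costs O(count * log N) instead of sorting the whole prediction
array. The same logic as a standalone sketch (TopN() is illustrative, not
part of the patch):

    #include <algorithm>
    #include <queue>
    #include <utility>
    #include <vector>

    std::vector<std::pair<float, int>> TopN(const float *scores, int count, size_t n)
    {
        // Min-heap ordered by score; the root is always the weakest candidate.
        std::priority_queue<std::pair<float, int>,
                            std::vector<std::pair<float, int>>,
                            std::greater<std::pair<float, int>>> pq;
        for (int i = 0; i < count; ++i) {
            pq.push({scores[i], i});
            if (pq.size() > n)
                pq.pop(); // evict the current minimum once over capacity
        }
        std::vector<std::pair<float, int>> out;
        while (!pq.empty()) {
            out.push_back(pq.top());
            pq.pop();
        }
        std::reverse(out.begin(), out.end()); // descending by score
        return out;
    }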
@@ -464,13 +674,51 @@ int Inference::GetClassficationResults(ImageClassificationResults *classificatio
 
 int Inference::GetObjectDetectionResults(ObjectDetectionResults *detectionResults)
 {
-    ObjectDetectionResults results;
-    int ret = mBackend->GetInferenceResult(results);
+    tensor_t outputData;
+    int ret = mBackend->GetInferenceResult(outputData);
     if (ret != INFERENCE_ENGINE_ERROR_NONE) {
         LOGE("Fail to GetObjectDetectionResults");
         return ConvertEngineErrorToVisionError(ret);
     }
 
+    std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+    std::vector<void *> inferResults(outputData.data.begin(), outputData.data.end());
+
+    float *boxes = reinterpret_cast<float *>(inferResults[0]);
+    float *classes = reinterpret_cast<float *>(inferResults[1]);
+    float *scores = reinterpret_cast<float *>(inferResults[2]);
+    int number_of_detections = (int)(*reinterpret_cast<float *>(inferResults[3]));
+
+    int left, top, right, bottom;
+    cv::Rect loc;
+
+    ObjectDetectionResults results;
+    results.number_of_objects = 0;
+    for (int idx = 0; idx < number_of_detections; ++idx) {
+        if (scores[idx] < mThreshold)
+            continue;
+
+        left = (int)(boxes[idx * 4 + 1] * mSourceSize.width);
+        top = (int)(boxes[idx * 4 + 0] * mSourceSize.height);
+        right = (int)(boxes[idx * 4 + 3] * mSourceSize.width);
+        bottom = (int)(boxes[idx * 4 + 2] * mSourceSize.height);
+
+        loc.x = left;
+        loc.y = top;
+        loc.width = right - left + 1;
+        loc.height = bottom - top + 1;
+
+        results.indices.push_back((int)classes[idx]);
+        results.confidences.push_back(scores[idx]);
+        results.names.push_back(mUserListName[(int)classes[idx]]);
+        results.locations.push_back(loc);
+        results.number_of_objects++;
+
+        LOGI("objectClass: %d", (int)classes[idx]);
+        LOGI("confidence:%f", scores[idx]);
+        LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);
+    }
+
     *detectionResults = results;
     LOGE("Inference: GetObjectDetectionResults: %d\n", results.number_of_objects);
     return MEDIA_VISION_ERROR_NONE;
@@ -478,8 +726,49 @@ int Inference::GetObjectDetectionResults(ObjectDetectionResults *detectionResult
 
 int Inference::GetFaceDetectionResults(FaceDetectionResults *detectionResults)
 {
+    tensor_t outputData;
+    int ret = mBackend->GetInferenceResult(outputData);
+    if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+        LOGE("Fail to GetFaceDetectionResults");
+        return ConvertEngineErrorToVisionError(ret);
+    }
+
+    std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+    std::vector<void *> inferResults(outputData.data.begin(), outputData.data.end());
+
+    float *boxes = reinterpret_cast<float *>(inferResults[0]);
+    float *classes = reinterpret_cast<float *>(inferResults[1]);
+    float *scores = reinterpret_cast<float *>(inferResults[2]);
+    int number_of_detections = (int)(*reinterpret_cast<float *>(inferResults[3]));
+
+    int left, top, right, bottom;
+    cv::Rect loc;
+
     FaceDetectionResults results;
-    mBackend->GetInferenceResult(results);
+    results.number_of_faces = 0;
+    for (int idx = 0; idx < number_of_detections; ++idx) {
+        if (scores[idx] < mThreshold)
+            continue;
+
+        left = (int)(boxes[idx * 4 + 1] * mSourceSize.width);
+        top = (int)(boxes[idx * 4 + 0] * mSourceSize.height);
+        right = (int)(boxes[idx * 4 + 3] * mSourceSize.width);
+        bottom = (int)(boxes[idx * 4 + 2] * mSourceSize.height);
+
+        loc.x = left;
+        loc.y = top;
+        loc.width = right - left + 1;
+        loc.height = bottom - top + 1;
+
+        results.confidences.push_back(scores[idx]);
+        results.locations.push_back(loc);
+        results.number_of_faces++;
+
+        LOGI("confidence:%f", scores[idx]);
+        LOGI("class: %f", classes[idx]);
+        LOGI("left:%f, top:%f, right:%f, bottom:%f", boxes[idx * 4 + 1], boxes[idx * 4 + 0], boxes[idx * 4 + 3], boxes[idx * 4 + 2]);
+        LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);
+    }
 
     *detectionResults = results;
     LOGE("Inference: GetFaceDetectionResults: %d\n", results.number_of_faces);
@@ -488,8 +777,34 @@ int Inference::GetFaceDetectionResults(FaceDetectionResults *detectionResults)
 
 int Inference::GetFacialLandMarkDetectionResults(FacialLandMarkDetectionResults *detectionResults)
 {
+    tensor_t outputData;
+    int ret = mBackend->GetInferenceResult(outputData);
+    if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+        LOGE("Fail to GetFacialLandMarkDetectionResults");
+        return ConvertEngineErrorToVisionError(ret);
+    }
+
+    std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+    std::vector<void *> inferResults(outputData.data.begin(), outputData.data.end());
+
+    long number_of_detections = inferDimInfo[0][1];
+    float *loc = reinterpret_cast<float *>(inferResults[0]);
+
     FacialLandMarkDetectionResults results;
-    mBackend->GetInferenceResult(results);
+    results.number_of_landmarks = 0;
+
+    cv::Point point(0, 0);
+    LOGI("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height);
+    for (int idx = 0; idx < number_of_detections; idx += 2) {
+        point.x = (int)(loc[idx] * mSourceSize.width);
+        point.y = (int)(loc[idx + 1] * mSourceSize.height);
+
+        results.locations.push_back(point);
+        results.number_of_landmarks++;
+
+        LOGI("x:%d, y:%d", point.x, point.y);
+    }
 
     *detectionResults = results;
     LOGE("Inference: FacialLandmarkDetectionResults: %d\n", results.number_of_landmarks);
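For reference, the detection decoders above assume the model emits four
parallel output tensors - boxes, classes, scores, and a detection count -
with each box stored as normalized [ymin, xmin, ymax, xmax]; that is why
index 1 scales by the source width (left) and index 0 by the source height
(top). This matches the usual TFLite SSD post-processing layout. A
hypothetical helper that makes the layout explicit:

    #include <opencv2/core.hpp>

    // One detection = four floats [ymin, xmin, ymax, xmax] in [0, 1].
    cv::Rect DecodeBox(const float *boxes, int idx, const cv::Size &src)
    {
        int left   = (int)(boxes[idx * 4 + 1] * src.width);
        int top    = (int)(boxes[idx * 4 + 0] * src.height);
        int right  = (int)(boxes[idx * 4 + 3] * src.width);
        int bottom = (int)(boxes[idx * 4 + 2] * src.height);
        return cv::Rect(left, top, right - left + 1, bottom - top + 1);
    }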