From: Inki Dae <inki.dae@samsung.com>
Date: Wed, 12 Feb 2020 02:35:21 +0000 (+0900)
Subject: mv_inference: Move OpenCV dependent code from inference-engine-vision
X-Git-Tag: submit/tizen/20200423.063253~38
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=494a50e8f2ddb187b58f27c8c38e4c9db8c1675b;p=platform%2Fcore%2Fapi%2Fmediavision.git

mv_inference: Move OpenCV dependent code from inference-engine-vision

This patch moves OpenCV dependent code from the inference-engine-vision
layer to the Inference layer. Now we can remove all inference-engine-vision
relevant files - inference_engine_vision_impl.cpp and
inference_engine_vision_impl.h.
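After this change a caller drives the whole pipeline through the Inference
class alone. As a rough sketch based on the methods touched by this patch
(values are illustrative and error handling is omitted), the expected call
sequence for classification, given an mv_source_h mvSource, is:

    mediavision::inference::Inference infer;
    infer.ConfigureModelFiles(configPath, weightPath, labelPath);
    // width, height, dim, ch, std deviation, mean - illustrative values
    infer.ConfigureTensorInfo(224, 224, 1, 3, 127.5, 127.5);
    infer.Prepare();                 // caches tensor info, mean/deviation, threshold
    infer.Load();                    // loads labels and model, maps the input buffer
    infer.Run(mvSource, NULL);       // DoPreprocess() now runs here, then mBackend->Run()
    ImageClassificationResults results;
    infer.GetClassficationResults(&results);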
Change-Id: I7750a4b8b85aba8a2d220fb4ee88f7007d6d5939
Signed-off-by: Inki Dae <inki.dae@samsung.com>
---

diff --git a/mv_inference/inference/include/Inference.h b/mv_inference/inference/include/Inference.h
index 223ad155..4c6a4f6e 100755
--- a/mv_inference/inference/include/Inference.h
+++ b/mv_inference/inference/include/Inference.h
@@ -24,6 +24,8 @@
 #include "inference_engine_error.h"
 #include "inference_engine_vision_impl.h"
 #include
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
 
 /**
  * @file Inference.h
@@ -31,6 +33,33 @@
  * provides inference interface.
  */
 using namespace InferenceEngineInterface::Vision;
+
+typedef struct _ImageClassficationResults {
+    int number_of_classes;
+    std::vector<int> indices;
+    std::vector<std::string> names;
+    std::vector<float> confidences;
+} ImageClassificationResults; /**< structure ImageClassificationResults */
+
+typedef struct _ObjectDetectionResults {
+    int number_of_objects;
+    std::vector<int> indices;
+    std::vector<std::string> names;
+    std::vector<float> confidences;
+    std::vector<cv::Rect> locations;
+} ObjectDetectionResults; /**< structure ObjectDetectionResults */
+
+typedef struct _FaceDetectionResults {
+    int number_of_faces;
+    std::vector<float> confidences;
+    std::vector<cv::Rect> locations;
+} FaceDetectionResults; /**< structure FaceDetectionResults */
+
+typedef struct _FacialLandMarkDetectionResults {
+    int number_of_landmarks;
+    std::vector<cv::Point> locations;
+} FacialLandMarkDetectionResults; /**< structure FacialLandMarkDetectionResults */
+
 namespace mediavision {
 namespace inference {
@@ -262,20 +291,33 @@ public:
 
 private:
     bool mCanRun; /**< The flag indicating ready to run Inference */
+    InferenceConfig mConfig;
+    inference_engine_capacity mBackendCapacity;
+    std::map<int, std::pair<std::string, bool>> mSupportedInferenceBackend;
+    cv::Size mInputSize;
+    int mCh;
+    int mDim;
+    double mDeviation;
+    double mMean;
+    double mThreshold;
+    int mOutputNumbers;
+    cv::Size mSourceSize;
+    cv::Mat mInputBuffer;
+    int mMatType;
 
-    InferenceConfig mConfig;
     mv_engine_config_h engine_config;
 
-    inference_engine_capacity mBackendCapacity;
     InferenceEngineVision * mBackend;
 
-    std::map<int, std::pair<std::string, bool>> mSupportedInferenceBackend;
     std::map<std::string, int> mModelFormats;
+    std::vector<std::string> mUserListName;
 
 private:
     void CheckSupportedInferenceBackend();
     int ConvertEngineErrorToVisionError(int error);
     int ConvertTargetTypes(int given_types);
+    int DoPreprocess(cv::Mat cvImg);
+    int SetUserFile(std::string filename);
 };
 
 } /* Inference */
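The four result typedefs above are plain parallel arrays: entry i of
indices/names/confidences/locations all describe detection i. A minimal,
hypothetical consumer of ObjectDetectionResults (assuming Inference.h is
included) would look like:

    #include <cstdio>

    void PrintDetections(const ObjectDetectionResults &r)
    {
        for (int i = 0; i < r.number_of_objects; ++i)
            printf("%s (%d): %.2f at [%d, %d, %dx%d]\n",
                   r.names[i].c_str(), r.indices[i], r.confidences[i],
                   r.locations[i].x, r.locations[i].y,
                   r.locations[i].width, r.locations[i].height);
    }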
diff --git a/mv_inference/inference/src/Inference.cpp b/mv_inference/inference/src/Inference.cpp
index ee72a6d6..4ef3f66f 100755
--- a/mv_inference/inference/src/Inference.cpp
+++ b/mv_inference/inference/src/Inference.cpp
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 #include
 
 #define MV_INFERENCE_OUTPUT_NUMBERS_MAX 10
@@ -30,6 +31,16 @@
 #define MV_INFERENCE_CONFIDENCE_THRESHOLD_MAX 1.0
 #define MV_INFERENCE_CONFIDENCE_THRESHOLD_MIN 0.0
 
+typedef enum {
+    InputAttrNoType = 0,
+    InputAttrFloat32 = 1,
+    InputAttrInt32 = 2,
+    InputAttrUInt8 = 3,
+    InputAttrInt64 = 4,
+    InputAttrString = 5,
+    InputAttrBool = 6,
+} InputAttrType;
+
 namespace mediavision {
 namespace inference {
 InferenceConfig::InferenceConfig() :
@@ -53,7 +64,17 @@ Inference::Inference() :
     mCanRun(),
     mConfig(),
     mBackendCapacity(),
-    mSupportedInferenceBackend()
+    mSupportedInferenceBackend(),
+    mInputSize(cv::Size()),
+    mCh(0),
+    mDim(0),
+    mDeviation(0.0),
+    mMean(0.0),
+    mThreshold(0.0),
+    mOutputNumbers(0),
+    mSourceSize(cv::Size()),
+    mInputBuffer(cv::Mat()),
+    mMatType(0)
 {
     LOGI("ENTER");
 
@@ -178,6 +199,68 @@ int Inference::ConvertTargetTypes(int given_types)
     return target_types;
 }
 
+int Inference::DoPreprocess(cv::Mat cvImg)
+{
+    mSourceSize = cvImg.size();
+    int width = mInputSize.width;
+    int height = mInputSize.height;
+
+    // Convert the color space if the model expects a single channel.
+    cv::Mat sample;
+    if (cvImg.channels() == 3 && mCh == 1)
+        cv::cvtColor(cvImg, sample, cv::COLOR_BGR2GRAY);
+    else
+        sample = cvImg;
+
+    // Resize the source image to the input tensor size.
+    cv::Mat sampleResized;
+    if (sample.size() != cv::Size(width, height))
+        cv::resize(sample, sampleResized, cv::Size(width, height));
+    else
+        sampleResized = sample;
+
+    // Convert the pixel type to float for the normalization below.
+    cv::Mat sampleFloat;
+    if (mCh == 3)
+        sampleResized.convertTo(sampleFloat, CV_32FC3);
+    else
+        sampleResized.convertTo(sampleFloat, CV_32FC1);
+
+    // Normalize: subtract the mean and divide by the deviation.
+    cv::Mat sampleNormalized;
+    cv::Mat meanMat;
+    if (mCh == 3)
+        meanMat = cv::Mat(sampleFloat.size(), CV_32FC3, cv::Scalar((float)mMean, (float)mMean, (float)mMean));
+    else
+        meanMat = cv::Mat(sampleFloat.size(), CV_32FC1, cv::Scalar((float)mMean));
+
+    cv::subtract(sampleFloat, meanMat, sampleNormalized);
+
+    sampleNormalized /= (float)mDeviation;
+
+    sampleNormalized.convertTo(mInputBuffer, mMatType);
+
+    return MEDIA_VISION_ERROR_NONE;
+}
+
+int Inference::SetUserFile(std::string filename)
+{
+    std::ifstream fp(filename.c_str());
+    if (!fp.is_open()) {
+        return MEDIA_VISION_ERROR_INVALID_PATH;
+    }
+
+    std::string userListName;
+    while (!fp.eof()) {
+        std::getline(fp, userListName);
+        if (userListName.length())
+            mUserListName.push_back(userListName);
+    }
+
+    fp.close();
+
+    return MEDIA_VISION_ERROR_NONE;
+}
+
 void Inference::ConfigureModelFiles(const std::string modelConfigFilePath,
                                     const std::string modelWeightFilePath,
                                     const std::string modelUserFilePath)
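DoPreprocess() above boils down to out = (pixel - mean) / deviation, applied
after an optional BGR-to-gray conversion and a resize to the model's input
size, with the result written into mInputBuffer. A standalone OpenCV
equivalent for the 3-channel case (a sketch; Preprocess() is not part of this
patch) is:

    #include <opencv2/core.hpp>
    #include <opencv2/imgproc.hpp>

    cv::Mat Preprocess(const cv::Mat &src, cv::Size inputSize,
                       double mean, double deviation)
    {
        cv::Mat resized, asFloat;
        cv::resize(src, resized, inputSize);      // scale to the input tensor size
        resized.convertTo(asFloat, CV_32FC3);     // float for the arithmetic below
        asFloat -= cv::Scalar(mean, mean, mean);  // subtract the per-channel mean
        asFloat /= deviation;                     // divide by the deviation
        return asFloat;
    }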
@@ -194,7 +277,7 @@ void Inference::ConfigureTensorInfo(int width,
                                     double stdValue,
                                     double meanValue)
 {
-  mConfig.mTensorInfo = {width, height, dim, ch};
+    mConfig.mTensorInfo = {width, height, dim, ch};
     mConfig.mStdValue = stdValue;
     mConfig.mMeanValue = meanValue;
 }
@@ -298,20 +381,22 @@ int Inference::Prepare(void)
 {
     LOGI("ENTER");
 
-    // Input Tensor Param
-    mBackend->SetInputTensorParamInput(mConfig.mTensorInfo.width,
-                                       mConfig.mTensorInfo.height,
-                                       mConfig.mTensorInfo.dim,
-                                       mConfig.mTensorInfo.ch);
+    mCh = mConfig.mTensorInfo.ch;
+    mDim = mConfig.mTensorInfo.dim;
+    mInputSize = cv::Size(mConfig.mTensorInfo.width, mConfig.mTensorInfo.height);
+    LOGI("InputSize is %d x %d\n", mInputSize.width, mInputSize.height);
 
-    mBackend->SetInputTensorParamNorm(mConfig.mStdValue, mConfig.mMeanValue);
+    mDeviation = mConfig.mStdValue;
+    mMean = mConfig.mMeanValue;
+    LOGI("mean %.4f, deviation %.4f", mMean, mDeviation);
 
     mBackend->SetInputTensorParamNode(mConfig.mInputNodeName);
 
-    // Output Tensor Param
-    mBackend->SetOutputTensorParamNumbers(mConfig.mMaxOutputNumbers);
+    mOutputNumbers = mConfig.mMaxOutputNumbers;
+    LOGI("outputNumber %d", mOutputNumbers);
 
-    mBackend->SetOutputTensorParamThresHold(mConfig.mConfidenceThresHold);
+    mThreshold = mConfig.mConfidenceThresHold;
+    LOGI("threshold %.4f", mThreshold);
 
     mBackend->SetOutputTensorParamNodes(mConfig.mOutputNodeNames);
@@ -348,6 +433,19 @@ int Inference::Load(void)
 {
     LOGI("ENTER");
 
+    std::string label_file = mConfig.mUserFilePath;
+    size_t userFileLength = label_file.length();
+    if (userFileLength > 0 && access(label_file.c_str(), F_OK)) {
+        LOGE("Label file does not exist at [%s]", label_file.c_str());
+        return MEDIA_VISION_ERROR_INVALID_PARAMETER;
+    }
+
+    int ret = (userFileLength > 0) ? SetUserFile(label_file) : MEDIA_VISION_ERROR_NONE;
+    if (ret != MEDIA_VISION_ERROR_NONE) {
+        LOGE("Fail to load label file.");
+        return ret;
+    }
+
     // Check if model file is valid or not.
     std::string ext_str = mConfig.mWeightFilePath.substr(mConfig.mWeightFilePath.find_last_of(".") + 1);
     std::map<std::string, int>::iterator key = mModelFormats.find(ext_str);
@@ -378,19 +476,73 @@ int Inference::Load(void)
         break;
     }
 
-    models.push_back(mConfig.mUserFilePath);
-
     // Request model loading to backend engine.
-    int ret = mBackend->Load(models, (inference_model_format_e)key->second);
+    ret = mBackend->Load(models, (inference_model_format_e)key->second);
     if (ret != INFERENCE_ENGINE_ERROR_NONE) {
         delete mBackend;
         LOGE("Fail to load model");
         mCanRun = false;
-        goto out;
+        std::vector<std::string>().swap(models);
+        return ConvertEngineErrorToVisionError(ret);
     }
 
+    // Get the input tensor type and set up mMatType / mInputBuffer accordingly.
+    InputAttrType attrType = static_cast<InputAttrType>(mBackend->GetInputLayerAttrType());
+    if (attrType == InputAttrUInt8) {
+        LOGI("InputType is %d ch with UINT8", mCh);
+        if (mCh == 1) {
+            mMatType = CV_8UC1;
+        } else if (mCh == 3) {
+            mMatType = CV_8UC3;
+        } else {
+            LOGE("Not supported");
+            std::vector<std::string>().swap(models);
+            return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
+        }
+    } else if (attrType == InputAttrFloat32) {
+        LOGI("InputType is %d ch with FLOAT32", mCh);
+        if (mCh == 1) {
+            mMatType = CV_32FC1;
+        } else if (mCh == 3) {
+            mMatType = CV_32FC3;
+        } else {
+            LOGE("Not supported");
+            std::vector<std::string>().swap(models);
+            return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
+        }
+    } else {
+        LOGE("Not supported");
+        std::vector<std::string>().swap(models);
+        return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
+    }
+
+    tensor_t inputData;
+    std::vector<int> info{1, mMatType, mInputSize.height, mInputSize.width};
+    inputData.dimInfo.push_back(info);
+
+    // Some plug-ins (OpenCV) do not allocate input memory while loading a model,
+    // while others (TFLite) do. Thus SetInputDataBuffer() is implemented in
+    // plug-ins such as OpenCV but is left empty in plug-ins such as TFLite.
+    ret = mBackend->SetInputDataBuffer(inputData);
+    if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+        LOGE("Fail to SetInputData");
+        return ConvertEngineErrorToVisionError(ret);
+    }
+
+    void *dataPtr = mBackend->GetInputDataPtr();
+    if (dataPtr == nullptr) {
+        LOGE("Input data address is null");
+        std::vector<std::string>().swap(models);
+        return MEDIA_VISION_ERROR_INTERNAL;
+    }
+
+    mInputBuffer = cv::Mat(mInputSize.height, mInputSize.width, mMatType, dataPtr);
+
     mCanRun = true;
 
-out:
+    std::vector<std::string>().swap(models);
 
     LOGI("LEAVE");
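A side note on the new input path: Load() wraps the backend's input tensor
memory in a cv::Mat (mInputBuffer), so the convertTo() at the end of
DoPreprocess() writes pixels straight into the tensor with no extra copy.
The cv::Mat constructor taking an external pointer only creates a header
around existing memory. A minimal sketch of that idea (the vector stands in
for the backend's buffer; it is not the real GetInputDataPtr() memory):

    #include <opencv2/core.hpp>
    #include <vector>

    int main()
    {
        std::vector<float> backing(224 * 224 * 3);        // stand-in for GetInputDataPtr()
        cv::Mat view(224, 224, CV_32FC3, backing.data()); // header only - no allocation, no copy
        view.setTo(cv::Scalar(0.5f, 0.5f, 0.5f));         // writes land directly in 'backing'
        return 0;
    }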
@@ -438,7 +590,15 @@ int Inference::Run(mv_source_h mvSource, mv_rectangle_s *roi)
     }
 
     LOGE("Size: w:%d, h:%d", cvSource.size().width, cvSource.size().height);
-    ret = mBackend->Run(cvSource);
+
+    // Convert the color space of the input tensor data and then normalize it.
+    ret = DoPreprocess(cvSource);
+    if (ret != MEDIA_VISION_ERROR_NONE) {
+        LOGE("Fail to preprocess input tensor data.");
+        return ret;
+    }
+
+    ret = mBackend->Run();
 
     return ConvertEngineErrorToVisionError(ret);
 }
@@ -450,13 +610,63 @@ std::pair<std::string, bool> Inference::GetSupportedInferenceBackend(int backend
 
 int Inference::GetClassficationResults(ImageClassificationResults *classificationResults)
 {
-    ImageClassificationResults results;
-    int ret = mBackend->GetInferenceResult(results);
+    tensor_t outputData;
+    int ret = mBackend->GetInferenceResult(outputData);
     if (ret != INFERENCE_ENGINE_ERROR_NONE) {
         LOGE("Fail to GetClassificationResults");
         return ConvertEngineErrorToVisionError(ret);
     }
 
+    // Will contain the top N results in ascending order.
+    std::vector<std::pair<float, int>> top_results;
+    std::priority_queue<std::pair<float, int>,
+                        std::vector<std::pair<float, int>>,
+                        std::greater<std::pair<float, int>>> top_result_pq;
+    float value;
+
+    std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+    std::vector<void *> inferResults(outputData.data.begin(), outputData.data.end());
+
+    long count = inferDimInfo[0][1];
+    LOGI("count: %ld", count);
+
+    float *prediction = reinterpret_cast<float *>(inferResults[0]);
+    for (int i = 0; i < count; ++i) {
+        value = prediction[i];
+
+        // Push every score; threshold filtering is applied after the top-N
+        // candidates have been selected.
+        top_result_pq.push(std::pair<float, int>(value, i));
+
+        // If at capacity, kick the smallest value out.
+        if (top_result_pq.size() > mOutputNumbers) {
+            top_result_pq.pop();
+        }
+    }
+
+    // Copy to the output vector and reverse into descending order.
+    while (!top_result_pq.empty()) {
+        top_results.push_back(top_result_pq.top());
+        top_result_pq.pop();
+    }
+    std::reverse(top_results.begin(), top_results.end());
+
+    int classIdx = -1;
+    ImageClassificationResults results;
+    results.number_of_classes = 0;
+    for (int idx = 0; idx < top_results.size(); ++idx) {
+        if (top_results[idx].first < mThreshold)
+            continue;
+        LOGI("idx:%d", idx);
+        LOGI("classIdx: %d", top_results[idx].second);
+        LOGI("classProb: %f", top_results[idx].first);
+
+        classIdx = top_results[idx].second;
+        results.indices.push_back(classIdx);
+        results.confidences.push_back(top_results[idx].first);
+        results.names.push_back(mUserListName[classIdx]);
+        results.number_of_classes++;
+    }
+
     *classificationResults = results;
     LOGE("Inference: GetClassificationResults: %d\n", results.number_of_classes);
     return MEDIA_VISION_ERROR_NONE;
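The classification path keeps only the best N scores with a min-heap, so
selection costs O(count * log N) instead of sorting the whole prediction
array. The same logic as a standalone sketch (TopN() is illustrative, not
part of the patch):

    #include <algorithm>
    #include <queue>
    #include <utility>
    #include <vector>

    std::vector<std::pair<float, int>> TopN(const float *scores, int count, size_t n)
    {
        // Min-heap ordered by score; the root is always the weakest candidate.
        std::priority_queue<std::pair<float, int>,
                            std::vector<std::pair<float, int>>,
                            std::greater<std::pair<float, int>>> pq;
        for (int i = 0; i < count; ++i) {
            pq.push({scores[i], i});
            if (pq.size() > n)
                pq.pop(); // evict the current minimum once over capacity
        }
        std::vector<std::pair<float, int>> out;
        while (!pq.empty()) {
            out.push_back(pq.top());
            pq.pop();
        }
        std::reverse(out.begin(), out.end()); // descending by score
        return out;
    }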
@@ -464,13 +674,51 @@ int Inference::GetClassficationResults(ImageClassificationResults *classificatio
 
 int Inference::GetObjectDetectionResults(ObjectDetectionResults *detectionResults)
 {
-    ObjectDetectionResults results;
-    int ret = mBackend->GetInferenceResult(results);
+    tensor_t outputData;
+    int ret = mBackend->GetInferenceResult(outputData);
     if (ret != INFERENCE_ENGINE_ERROR_NONE) {
         LOGE("Fail to GetObjectDetectionResults");
         return ConvertEngineErrorToVisionError(ret);
     }
 
+    std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+    std::vector<void *> inferResults(outputData.data.begin(), outputData.data.end());
+
+    float *boxes = reinterpret_cast<float *>(inferResults[0]);
+    float *classes = reinterpret_cast<float *>(inferResults[1]);
+    float *scores = reinterpret_cast<float *>(inferResults[2]);
+    int number_of_detections = (int)(*reinterpret_cast<float *>(inferResults[3]));
+
+    int left, top, right, bottom;
+    cv::Rect loc;
+
+    ObjectDetectionResults results;
+    results.number_of_objects = 0;
+    for (int idx = 0; idx < number_of_detections; ++idx) {
+        if (scores[idx] < mThreshold)
+            continue;
+
+        left = (int)(boxes[idx * 4 + 1] * mSourceSize.width);
+        top = (int)(boxes[idx * 4 + 0] * mSourceSize.height);
+        right = (int)(boxes[idx * 4 + 3] * mSourceSize.width);
+        bottom = (int)(boxes[idx * 4 + 2] * mSourceSize.height);
+
+        loc.x = left;
+        loc.y = top;
+        loc.width = right - left + 1;
+        loc.height = bottom - top + 1;
+
+        results.indices.push_back((int)classes[idx]);
+        results.confidences.push_back(scores[idx]);
+        results.names.push_back(mUserListName[(int)classes[idx]]);
+        results.locations.push_back(loc);
+        results.number_of_objects++;
+
+        LOGI("objectClass: %d", (int)classes[idx]);
+        LOGI("confidence:%f", scores[idx]);
+        LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);
+    }
+
     *detectionResults = results;
     LOGE("Inference: GetObjectDetectionResults: %d\n", results.number_of_objects);
     return MEDIA_VISION_ERROR_NONE;
@@ -478,8 +726,49 @@ int Inference::GetObjectDetectionResults(ObjectDetectionResults *detectionResult
 
 int Inference::GetFaceDetectionResults(FaceDetectionResults *detectionResults)
 {
+    tensor_t outputData;
+    int ret = mBackend->GetInferenceResult(outputData);
+    if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+        LOGE("Fail to GetFaceDetectionResults");
+        return ConvertEngineErrorToVisionError(ret);
+    }
+
+    std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+    std::vector<void *> inferResults(outputData.data.begin(), outputData.data.end());
+
+    float *boxes = reinterpret_cast<float *>(inferResults[0]);
+    float *classes = reinterpret_cast<float *>(inferResults[1]);
+    float *scores = reinterpret_cast<float *>(inferResults[2]);
+    int number_of_detections = (int)(*reinterpret_cast<float *>(inferResults[3]));
+
+    int left, top, right, bottom;
+    cv::Rect loc;
+
     FaceDetectionResults results;
-    mBackend->GetInferenceResult(results);
+    results.number_of_faces = 0;
+    for (int idx = 0; idx < number_of_detections; ++idx) {
+        if (scores[idx] < mThreshold)
+            continue;
+
+        left = (int)(boxes[idx * 4 + 1] * mSourceSize.width);
+        top = (int)(boxes[idx * 4 + 0] * mSourceSize.height);
+        right = (int)(boxes[idx * 4 + 3] * mSourceSize.width);
+        bottom = (int)(boxes[idx * 4 + 2] * mSourceSize.height);
+
+        loc.x = left;
+        loc.y = top;
+        loc.width = right - left + 1;
+        loc.height = bottom - top + 1;
+
+        results.confidences.push_back(scores[idx]);
+        results.locations.push_back(loc);
+        results.number_of_faces++;
+
+        LOGI("confidence:%f", scores[idx]);
+        LOGI("class: %f", classes[idx]);
+        LOGI("left:%f, top:%f, right:%f, bottom:%f", boxes[idx * 4 + 1], boxes[idx * 4 + 0], boxes[idx * 4 + 3], boxes[idx * 4 + 2]);
+        LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);
+    }
 
     *detectionResults = results;
     LOGE("Inference: GetFaceDetectionResults: %d\n", results.number_of_faces);
@@ -488,8 +777,34 @@ int Inference::GetFaceDetectionResults(FaceDetectionResults *detectionResults)
 
 int Inference::GetFacialLandMarkDetectionResults(FacialLandMarkDetectionResults *detectionResults)
 {
+    tensor_t outputData;
+    int ret = mBackend->GetInferenceResult(outputData);
+    if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+        LOGE("Fail to GetFacialLandMarkDetectionResults");
+        return ConvertEngineErrorToVisionError(ret);
+    }
+
+    std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+    std::vector<void *> inferResults(outputData.data.begin(), outputData.data.end());
+
+    long number_of_detections = inferDimInfo[0][1];
+    float *loc = reinterpret_cast<float *>(inferResults[0]);
+
     FacialLandMarkDetectionResults results;
-    mBackend->GetInferenceResult(results);
+    results.number_of_landmarks = 0;
+
+    cv::Point point(0, 0);
+    LOGI("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height);
+    for (int idx = 0; idx < number_of_detections; idx += 2) {
+        point.x = (int)(loc[idx] * mSourceSize.width);
+        point.y = (int)(loc[idx + 1] * mSourceSize.height);
+
+        results.locations.push_back(point);
+        results.number_of_landmarks++;
+
+        LOGI("x:%d, y:%d", point.x, point.y);
+    }
 
     *detectionResults = results;
     LOGE("Inference: FacialLandmarkDetectionResults: %d\n", results.number_of_landmarks);
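For reference, the detection decoders above assume the model emits four
parallel output tensors - boxes, classes, scores, and a detection count -
with each box stored as normalized [ymin, xmin, ymax, xmax]; that is why
index 1 scales by the source width (left) and index 0 by the source height
(top). This matches the usual TFLite SSD post-processing layout. A
hypothetical helper that makes the layout explicit:

    #include <opencv2/core.hpp>

    // One detection = four floats [ymin, xmin, ymax, xmax] in [0, 1].
    cv::Rect DecodeBox(const float *boxes, int idx, const cv::Size &src)
    {
        int left   = (int)(boxes[idx * 4 + 1] * src.width);
        int top    = (int)(boxes[idx * 4 + 0] * src.height);
        int right  = (int)(boxes[idx * 4 + 3] * src.width);
        int bottom = (int)(boxes[idx * 4 + 2] * src.height);
        return cv::Rect(left, top, right - left + 1, bottom - top + 1);
    }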