#include <unistd.h>
#include <fstream>
#include <string>
+#include <queue>
#include <algorithm>
#define MV_INFERENCE_OUTPUT_NUMBERS_MAX 10
#define MV_INFERENCE_CONFIDENCE_THRESHOLD_MAX 1.0
#define MV_INFERENCE_CONFIDENCE_THRESHOLD_MIN 0.0
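+// Input tensor data types reported by the backend (cf. GetInputLayerAttrType()).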
+typedef enum {
+ InputAttrNoType = 0,
+ InputAttrFloat32 = 1,
+ InputAttrInt32 = 2,
+ InputAttrUInt8 = 3,
+ InputAttrInt64 = 4,
+ InputAttrString = 5,
+ InputAttrBool = 6,
+} InputAttrType;
+
namespace mediavision {
namespace inference {
InferenceConfig::InferenceConfig() :
mCanRun(),
mConfig(),
mBackendCapacity(),
- mSupportedInferenceBackend()
+ mSupportedInferenceBackend(),
+ mInputSize(cv::Size()),
+ mCh(0),
+ mDim(0),
+ mDeviation(0.0),
+ mMean(0.0),
+ mThreshold(0.0),
+ mOutputNumbers(0),
+ mSourceSize(cv::Size()),
+ mInputBuffer(cv::Mat()),
+ mMatType(0)
{
LOGI("ENTER");
}
+int Inference::DoPreprocess(cv::Mat cvImg)
+{
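+ // Preprocess in four steps: remember the source size (used later to map
+ // detection results back to source coordinates), match the model's channel
+ // count, resize to the model input resolution, then mean/std-normalize.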
+ mSourceSize = cvImg.size();
+ int width = mInputSize.width;
+ int height = mInputSize.height;
+
+ cv::Mat sample;
+ if (cvImg.channels() == 3 && mCh == 1)
+ cv::cvtColor(cvImg, sample, cv::COLOR_BGR2GRAY);
+ else
+ sample = cvImg;
+
+ // Resize to the model's expected input resolution.
+ cv::Mat sampleResized;
+ if (sample.size() != cv::Size(width, height))
+ cv::resize(sample, sampleResized, cv::Size(width, height));
+ else
+ sampleResized = sample;
+
+ // Convert to floating point so the normalization below keeps precision.
+ cv::Mat sampleFloat;
+ if (mCh == 3)
+ sampleResized.convertTo(sampleFloat, CV_32FC3);
+ else
+ sampleResized.convertTo(sampleFloat, CV_32FC1);
+
+ // Normalize: subtract the mean, then divide by the standard deviation.
+ cv::Mat sampleNormalized;
+ cv::Mat meanMat;
+ if (mCh == 3)
+ meanMat = cv::Mat(sampleFloat.size(), CV_32FC3, cv::Scalar((float)mMean, (float)mMean, (float)mMean));
+ else
+ meanMat = cv::Mat(sampleFloat.size(), CV_32FC1, cv::Scalar((float)mMean));
+
+ cv::subtract(sampleFloat, meanMat, sampleNormalized);
+
+ sampleNormalized /= (float)mDeviation;
+
+ sampleNormalized.convertTo(mInputBuffer, mMatType);
+
+ return MEDIA_VISION_ERROR_NONE;
+}
+
+int Inference::SetUserFile(std::string filename)
+{
+ std::ifstream fp(filename.c_str());
+ if (!fp.is_open()) {
+ return MEDIA_VISION_ERROR_INVALID_PATH;
+ }
+
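+ // Read one label per line, skipping empty lines.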
+ std::string userListName;
+ while (std::getline(fp, userListName)) {
+ if (userListName.length())
+ mUserListName.push_back(userListName);
+ }
+
+ fp.close();
+
+ return MEDIA_VISION_ERROR_NONE;
+}
+
void Inference::ConfigureModelFiles(const std::string modelConfigFilePath,
const std::string modelWeightFilePath,
const std::string modelUserFilePath)
double stdValue,
double meanValue)
{
- mConfig.mTensorInfo = {width, height, dim, ch};
+ mConfig.mTensorInfo = {width, height, dim, ch};
mConfig.mStdValue = stdValue;
mConfig.mMeanValue = meanValue;
}
{
LOGI("ENTER");
- // Input Tensor Param
- mBackend->SetInputTensorParamInput(mConfig.mTensorInfo.width,
- mConfig.mTensorInfo.height,
- mConfig.mTensorInfo.dim,
- mConfig.mTensorInfo.ch);
+ mCh = mConfig.mTensorInfo.ch;
+ mDim = mConfig.mTensorInfo.dim;
+ mInputSize = cv::Size(mConfig.mTensorInfo.width, mConfig.mTensorInfo.height);
+ LOGI("InputSize is %d x %d\n", mInputSize.width, mInputSize.height);
- mBackend->SetInputTensorParamNorm(mConfig.mStdValue, mConfig.mMeanValue);
+ mDeviation = mConfig.mStdValue;
+ mMean = mConfig.mMeanValue;
+ LOGI("mean %.4f, deviation %.4f", mMean, mDeviation);
mBackend->SetInputTensorParamNode(mConfig.mInputNodeName);
- // Output Tensor Param
- mBackend->SetOutputTensorParamNumbers(mConfig.mMaxOutputNumbers);
+ mOutputNumbers = mConfig.mMaxOutputNumbers;
+ LOGI("outputNumber %d", mOutputNumbers);
- mBackend->SetOutputTensorParamThresHold(mConfig.mConfidenceThresHold);
+ mThreshold = mConfig.mConfidenceThresHold;
+ LOGI("threshold %.4f", mThreshold);
mBackend->SetOutputTensorParamNodes(mConfig.mOutputNodeNames);
{
LOGI("ENTER");
+ std::string label_file = mConfig.mUserFilePath;
+ size_t userFileLength = label_file.length();
+ if (userFileLength > 0 && access(label_file.c_str(), F_OK)) {
+ LOGE("Label file path in [%s] ", label_file.c_str());
+ return MEDIA_VISION_ERROR_INVALID_PARAMETER;
+ }
+
+ int ret = (userFileLength > 0) ? SetUserFile(label_file) : MEDIA_VISION_ERROR_NONE;
+ if (ret != MEDIA_VISION_ERROR_NONE) {
+ LOGE("Fail to load label file.");
+ return ret;
+ }
+
// Check if model file is valid or not.
std::string ext_str = mConfig.mWeightFilePath.substr(mConfig.mWeightFilePath.find_last_of(".") + 1);
std::map<std::string, int>::iterator key = mModelFormats.find(ext_str);
break;
}
- models.push_back(mConfig.mUserFilePath);
-
// Request model loading to backend engine.
- int ret = mBackend->Load(models, (inference_model_format_e)key->second);
+ ret = mBackend->Load(models, (inference_model_format_e)key->second);
if (ret != INFERENCE_ENGINE_ERROR_NONE) {
delete mBackend;
LOGE("Fail to load model");
mCanRun = false;
- goto out;
+ std::vector<std::string>().swap(models);
+ return ConvertEngineErrorToVisionError(ret);
}
+ // Get the input tensor type and prepare mInputBuffer to share the backend's input memory.
+ InputAttrType attrType = static_cast<InputAttrType>(mBackend->GetInputLayerAttrType());
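+ // Map the backend tensor type to the matching OpenCV Mat type (8-bit unsigned vs. 32-bit float, 1 or 3 channels).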
+ if (attrType == InputAttrUInt8) {
+ LOGI("InputType is %d ch with UINT8", mCh);
+ if (mCh == 1) {
+ mMatType = CV_8UC1;
+ } else if (mCh == 3) {
+ mMatType = CV_8UC3;
+ } else {
+ LOGE("Not supported");
+ std::vector<std::string>().swap(models);
+ return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
+ }
+ } else if (attrType == InputAttrFloat32) {
+ LOGI("InputType is %d ch with FLOAT32", mCh);
+ if (mCh == 1) {
+ mMatType = CV_32FC1;
+ } else if (mCh == 3) {
+ mMatType = CV_32FC3;
+ } else {
+ LOGE("Not supported");
+ std::vector<std::string>().swap(models);
+ return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
+ }
+ } else {
+ LOGE("Not supported");
+ std::vector<std::string>().swap(models);
+ return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
+ }
+
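+ // Describe the input buffer to the backend as {1, mat type, height, width}.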
+ tensor_t inputData;
+ std::vector<int> info{1, mMatType, mInputSize.height, mInputSize.width};
+ inputData.dimInfo.push_back(info);
+
+ // Some plug-ins (e.g., OpenCV) don't allocate input memory while loading
+ // a model, but others (e.g., TFLite) do. Thus SetInputDataBuffer() is
+ // implemented in plug-ins such as OpenCV and left as a no-op in plug-ins
+ // such as TFLite.
+ ret = mBackend->SetInputDataBuffer(inputData);
+ if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+ LOGE("Fail to SetInputData");
+ return ConvertEngineErrorToVisionError(ret);;
+ }
+
+ void *dataPtr = mBackend->GetInputDataPtr();
+ if (dataPtr == nullptr) {
+ LOGE("input data address is null");
+ std::vector<std::string>().swap(models);
+ return MEDIA_VISION_ERROR_INTERNAL;
+ }
+
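+ // Wrap the backend's input memory in a cv::Mat header (no copy), so DoPreprocess() writes straight into the input tensor.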
+ mInputBuffer = cv::Mat(mInputSize.height, mInputSize.width, mMatType, dataPtr);
+
mCanRun = true;
-out:
+
std::vector<std::string>().swap(models);
LOGI("LEAVE");
}
LOGE("Size: w:%d, h:%d", cvSource.size().width, cvSource.size().height);
- ret = mBackend->Run(cvSource);
+
+ // Convert color space of input tensor data and then normalize it.
+ ret = DoPreprocess(cvSource);
+ if (ret != MEDIA_VISION_ERROR_NONE) {
+ LOGE("Fail to preprocess input tensor data.");
+ return ret;
+ }
+
+ ret = mBackend->Run();
return ConvertEngineErrorToVisionError(ret);
}
int Inference::GetClassficationResults(ImageClassificationResults *classificationResults)
{
- ImageClassificationResults results;
- int ret = mBackend->GetInferenceResult(results);
+ tensor_t outputData;
+ int ret = mBackend->GetInferenceResult(outputData);
if (ret != INFERENCE_ENGINE_ERROR_NONE) {
LOGE("Fail to GetClassificationResults");
return ConvertEngineErrorToVisionError(ret);
}
+ // Will hold the top N results, reversed into descending order at the end.
+ std::vector<std::pair<float, int>> top_results;
+ std::priority_queue<std::pair<float, int>,
+ std::vector<std::pair<float, int>>,
+ std::greater<std::pair<float, int>>> top_result_pq;
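+ // top_result_pq is a min-heap keyed on score; popping whenever it grows past mOutputNumbers keeps only the N best candidates.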
+ float value;
+
+ std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+ std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
+
+ long count = inferDimInfo[0][1];
+ LOGI("count: %ld", count);
+
+ float *prediction = reinterpret_cast<float*>(inferResults[0]);
+ for (int i = 0; i < count; ++i) {
+ value = prediction[i];
+ // Keep every candidate here; the confidence threshold is applied
+ // when copying into the final results below.
+ top_result_pq.push(std::pair<float, int>(value, i));
+
+ // If at capacity, kick the smallest value out.
+ if (top_result_pq.size() > (size_t)mOutputNumbers) {
+ top_result_pq.pop();
+ }
+ }
+
+ // Copy to output vector and reverse into descending order.
+ while (!top_result_pq.empty()) {
+ top_results.push_back(top_result_pq.top());
+ top_result_pq.pop();
+ }
+ std::reverse(top_results.begin(), top_results.end());
+
+ int classIdx = -1;
+ ImageClassificationResults results;
+ results.number_of_classes = 0;
+ for (int idx = 0; idx < top_results.size(); ++idx) {
+ if (top_results[idx].first < mThreshold)
+ continue;
+ LOGI("idx:%d", idx);
+ LOGI("classIdx: %d", top_results[idx].second);
+ LOGI("classProb: %f", top_results[idx].first);
+
+ classIdx = top_results[idx].second;
+ results.indices.push_back(classIdx);
+ results.confidences.push_back(top_results[idx].first);
+ results.names.push_back(mUserListName[classIdx]);
+ results.number_of_classes++;
+ }
+
*classificationResults = results;
LOGE("Inference: GetClassificationResults: %d\n", results.number_of_classes);
return MEDIA_VISION_ERROR_NONE;
int Inference::GetObjectDetectionResults(ObjectDetectionResults *detectionResults)
{
- ObjectDetectionResults results;
- int ret = mBackend->GetInferenceResult(results);
+ tensor_t outputData;
+ int ret = mBackend->GetInferenceResult(outputData);
if (ret != INFERENCE_ENGINE_ERROR_NONE) {
LOGE("Fail to GetObjectDetectionResults");
return ConvertEngineErrorToVisionError(ret);
}
+ std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+ std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
+
+ float* boxes = reinterpret_cast<float*>(inferResults[0]);
+ float* classes = reinterpret_cast<float*>(inferResults[1]);
+ float* scores = reinterpret_cast<float*>(inferResults[2]);
+ int number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
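+ // Output layout assumed here: boxes, classes, scores, and a scalar detection count in the 4th tensor. Boxes are [ymin, xmin, ymax, xmax], normalized to [0, 1].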
+
+ int left, top, right, bottom;
+ cv::Rect loc;
+
+ ObjectDetectionResults results;
+ results.number_of_objects = 0;
+ for (int idx = 0; idx < number_of_detections; ++idx) {
+ if (scores[idx] < mThreshold)
+ continue;
+
+ left = (int)(boxes[idx*4 + 1] * mSourceSize.width);
+ top = (int)(boxes[idx*4 + 0] * mSourceSize.height);
+ right = (int)(boxes[idx*4 + 3] * mSourceSize.width);
+ bottom = (int)(boxes[idx*4 + 2] * mSourceSize.height);
+
+ loc.x = left;
+ loc.y = top;
+ loc.width = right - left + 1;
+ loc.height = bottom - top + 1;
+
+ results.indices.push_back((int)classes[idx]);
+ results.confidences.push_back(scores[idx]);
+ results.names.push_back(mUserListName[(int)classes[idx]]);
+ results.locations.push_back(loc);
+ results.number_of_objects++;
+
+ LOGI("objectClass: %d", (int)classes[idx]);
+ LOGI("confidence:%f", scores[idx]);
+ LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);
+ }
+
*detectionResults = results;
LOGE("Inference: GetObjectDetectionResults: %d\n", results.number_of_objects);
return MEDIA_VISION_ERROR_NONE;
int Inference::GetFaceDetectionResults(FaceDetectionResults *detectionResults)
{
+ tensor_t outputData;
+ int ret = mBackend->GetInferenceResult(outputData);
+ if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+ LOGE("Fail to GetFaceDetectionResults");
+ return ConvertEngineErrorToVisionError(ret);
+ }
+
+ std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+ std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
+
+ float* boxes = reinterpret_cast<float*>(inferResults[0]);
+ float* classes = reinterpret_cast<float*>(inferResults[1]);
+ float* scores = reinterpret_cast<float*>(inferResults[2]);
+
+ int number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
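+ // Same output layout as object detection above: normalized [ymin, xmin, ymax, xmax] boxes plus per-detection scores.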
+ int left, top, right, bottom;
+ cv::Rect loc;
+
FaceDetectionResults results;
- mBackend->GetInferenceResult(results);
+ results.number_of_faces = 0;
+ for (int idx = 0; idx < number_of_detections; ++idx) {
+ if (scores[idx] < mThreshold)
+ continue;
+
+ left = (int)(boxes[idx*4 + 1] * mSourceSize.width);
+ top = (int)(boxes[idx*4 + 0] * mSourceSize.height);
+ right = (int)(boxes[idx*4 + 3] * mSourceSize.width);
+ bottom = (int)(boxes[idx*4 + 2] * mSourceSize.height);
+
+ loc.x = left;
+ loc.y = top;
+ loc.width = right - left + 1;
+ loc.height = bottom - top + 1;
+
+ results.confidences.push_back(scores[idx]);
+ results.locations.push_back(loc);
+ results.number_of_faces++;
+
+ LOGI("confidence:%f", scores[idx]);
+ LOGI("class: %f", classes[idx]);
+ LOGI("left:%f, top:%f, right:%f, bottom:%f", boxes[idx*4 + 1], boxes[idx*4 + 0], boxes[idx*4 + 3], boxes[idx*4 + 2]);
+ LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);
+ }
*detectionResults = results;
LOGE("Inference: GetFaceDetectionResults: %d\n", results.number_of_faces);
int Inference::GetFacialLandMarkDetectionResults(FacialLandMarkDetectionResults *detectionResults)
{
+ tensor_t outputData;
+ int ret = mBackend->GetInferenceResult(outputData);
+ if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+ LOGE("Fail to GetFacialLandMarkDetectionResults");
+ return ConvertEngineErrorToVisionError(ret);
+ }
+
+ std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+ std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
+
+ long number_of_detections = inferDimInfo[0][1];
+ float* loc = reinterpret_cast<float*>(inferResults[0]);
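+ // The landmark tensor is a flat array of normalized (x, y) pairs, hence the stride-2 loop below.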
+
FacialLandMarkDetectionResults results;
- mBackend->GetInferenceResult(results);
+ results.number_of_landmarks = 0;
+
+ cv::Point point(0, 0);
+ LOGI("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height);
+ for (int idx = 0; idx < number_of_detections; idx+=2) {
+ point.x = (int)(loc[idx] * mSourceSize.width);
+ point.y = (int)(loc[idx+1] * mSourceSize.height);
+
+ results.locations.push_back(point);
+ results.number_of_landmarks++;
+
+ LOGI("x:%d, y:%d", point.x, point.y);
+ }
*detectionResults = results;
LOGE("Inference: FacialLandmarkDetectionResults: %d\n", results.number_of_landmarks);